lumidify.org/1/git/transliterate/commit/de3bbf6acdd3d5f181717ef8b57d752d07040cb8.gph

  URI:

       tRevert changes done to split_words - transliterate - Transliteration engine
  HTML git clone git://lumidify.org/transliterate.git
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
   DIR commit de3bbf6acdd3d5f181717ef8b57d752d07040cb8
   DIR parent 6d2c4a738414c64f2e36968a96b2419eeded2441
  HTML Author: lumidify <nobody@lumidify.org>
       Date:   Wed,  1 Apr 2020 15:39:01 +0200
       
       Revert changes done to split_words
       
       Diffstat:
         M tests/test1/err.txt                 |       3 ++-
         M transliterate.pl                    |      76 +++++++++++++------------------
       
       2 files changed, 34 insertions(+), 45 deletions(-)
       ---
   DIR diff --git a/tests/test1/err.txt b/tests/test1/err.txt
       t@@ -1,4 +1,5 @@
       -Unknown word: "word20 word01231"
       +Unknown word: "word20"
       +Unknown word: "word01231"
        Word "word0_replaced$word0_replaced2" with 2 word choices.
        Unknown word: "aword1"
        Unknown word: "end3"
   DIR diff --git a/transliterate.pl b/transliterate.pl
       t@@ -943,35 +943,29 @@ sub handle_unknown_word_action {
        }
        
        # Split $substrings based on the "split" regex in $config.
       -# This only marks "lone" split characters or split characters at a
       -# border between transliterated and untransliterated blocks as
       -# transliterated in order to keep compound words together for 
       -# `prompt_unknown_word`.
        # $substrings can already be split at this point; only the
        # ones that haven't been transliterated yet are modified
        sub split_words {
                my ($config, $substrings) = @_;
       -        my $split_pre = qr/\A($config->{"split"})/;
       -        my $split_post = qr/($config->{"split"})\z/;
       +        my $split_re = qr/($config->{"split"})/;
                my @substrings_new;
       -        #FIXME: cleanup
                foreach my $cur_substr (@$substrings) {
                        if ($cur_substr->[0] == 1) {
                                push(@substrings_new, $cur_substr);
                                next;
                        }
       -                my $str = $cur_substr->[1];
       -                if ($str =~ /$split_pre/) {
       -                        push @substrings_new, [1, $1, $1];
       -                        $str = substr $str, length($1);
       -                }
       -                next if $str eq "";
       -                if ($str =~ /$split_post/) {
       -                        $str = substr $str, 0, -length($1);
       -                        push @substrings_new, [0, $str, $str];
       -                        push @substrings_new, [1, $1, $1];
       -                } else {
       -                        push @substrings_new, [0, $str, $str];
       +
       +                my @words = split(/$split_re/, $cur_substr->[1]);
       +                for my $i (0..$#words) {
       +                        # Word is not delimiter
       +                        # Split produces an empty field at the beginning if the string
       +                        # starts with the delimiter
       +                        if ($i % 2 == 0) {
       +                                push(@substrings_new, [0, $words[$i], $words[$i]]) if ($words[$i] ne '');
       +                        } else {
       +                                # Delimiters can count as already replaced
       +                                push(@substrings_new, [1, $words[$i], $words[$i]]);
       +                        }
                        }
                }
                @$substrings = @substrings_new;
       t@@ -1634,22 +1628,21 @@ statement on the text "c word1", there will still only be one chunk,
        properly.
        
        Once all the replacement statements have been processed, each chunk
       -of text that is not marked as transliterated yet is "trimmed" based on
       -the B<split> pattern specified in the config. This means that all
       -"lone" split characters are marked as transliterated and any other
       -untransliterated chunks have leading or trailing split characters
       -marked as transliterated. At this point, only chunks of actual text that
       -have not been transliterated are still marked as untransliterated.
       -These are now processed by the L<unknown word window|/"UNKNOWN WORD WINDOW">.
       -If one of these remaining unknown chunks is present in the file
       -specified by the B<ignore> statement in the config, it is simply ignored
       -and later printed out as is. After all untransliterated words have either
       -had a replacement added or been ignored, any words with multiple replacement
       -choices are processed by the word choice window. Once this is all done,
       -the final output is written to the output file and the process is
       -repeated with the next line. Note that the entire process is started
       -again each time a word is added to a table or the config is reloaded
       -from the L<unknown word window|/"UNKNOWN WORD WINDOW">.
       +of text that is not marked as transliterated yet is split based on
       +the B<split> pattern specified in the config and all actual characters
       +matched by the B<split> pattern are marked as transliterated (this
       +usually means all the spaces, newlines, quotation marks, etc.). Any
       +remaining words/text chunks that are still marked as untransliterated are
       +now processed by the unknown word window. If one of these remaining
       +unknown chunks is present in the file specified by the B<ignore>
       +statement in the config, it is simply ignored and later printed out
       +as is. After all untransliterated words have either had a replacement
       +added or been ignored, any words with multiple replacement choices are
       +processed by the word choice window. Once this is all done, the final
       +output is written to the output file and the process is repeated with
       +the next line. Note that the entire process is started again each time
       +a word is added to a table or the config is reloaded from the
       +L<unknown word window|/"UNKNOWN WORD WINDOW">.
        
        =head1 CONFIGURATION
        
       t@@ -1702,15 +1695,10 @@ otherwise all of the newlines will be marked as unknown words. Usually,
        this will be included anyways through C<\s>.
        
        Note also that B<split> should probably include the C<+> RegEx-quantifier
       -since that allows the splitting function in the end to also mark several
       -splitting characters in a row as transliterated.
       -
       -This is named a bit confusingly since it was originally used to split
       -the string completely based on the given pattern in the end. This was
       -changed later, so a better name now would be "trim", but it's already
       -called this way, so I don't feel like changing it. See the last
       -paragraph of L</"INTERNALS/EXAMPLES"> for a short description of how
       -the trimming works.
       +since that allows the splitting function in the end to ignore several
       +splitting characters right after each other (e.g. several spaces) in one
       +go instead of splitting the string again for every single one of them.
       +This shouldn't actually make any difference functionality-wise, though.
        
        B<Default:> C<\s+> (all whitespace)