lumidify.org/1/git/transliterate/commit/d08734f2e255b3f84a1e3f62df38e404800650c9.gph

  URI:

       tDon't split compound words before asking for unknown words - transliterate - Transliteration engine
  HTML git clone git://lumidify.org/transliterate.git
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
   DIR commit d08734f2e255b3f84a1e3f62df38e404800650c9
   DIR parent ec431a30af78b0cc2f936c8064cb57acecff5414
  HTML Author: lumidify <nobody@lumidify.org>
       Date:   Tue, 31 Mar 2020 16:20:19 +0200
       
       Don't split compound words before asking for unknown words
       
       Diffstat:
         M tests/test5/input.txt               |       2 +-
         M transliterate.pl                    |      31 ++++++++++++++++++++++++++++++-
       
       2 files changed, 31 insertions(+), 2 deletions(-)
       ---
   DIR diff --git a/tests/test5/input.txt b/tests/test5/input.txt
       t@@ -1,3 +1,3 @@
        ignore
       -wörd0   word1end1
       +wörd0 wörd0  word1end1
        -dword9end2 word9end2-d
   DIR diff --git a/transliterate.pl b/transliterate.pl
       t@@ -960,14 +960,42 @@ sub handle_unknown_word_action {
                return 0;
        }
        
       +# FIXME: This only splits off "lone" split characters or those at the border to a
       +# transliterated block, in order to keep compound words together for replace. The
       +# cruft needs to be removed at some point.
        # Split $substrings into single words based on the "split" option
        # in $config.
        # $substrings can already be split at this point; only the
        # ones that haven't been transliterated yet are modified
        sub split_words {
                my ($config, $substrings) = @_;
       -        my @substrings_new;
       +        # FIXME: is it more efficient to pre-compile with \A and \z individually?
                my $split_re = qr/($config->{"split"})/;
       +        my @substrings_new;
       +        #FIXME: cleanup
       +        foreach my $cur_substr (@$substrings) {
       +                if ($cur_substr->[0] == 1) {
       +                        push(@substrings_new, $cur_substr);
       +                        next;
       +                }
       +                my $str = $cur_substr->[1];
       +                if ($str =~ /\A$split_re/) {
       +                        push @substrings_new, [1, $1, $1];
       +                        $str = substr $str, length($1);
       +                }
       +                next if $str eq "";
       +                if ($str =~ /$split_re\z/) {
       +                        $str = substr $str, 0, -length($1);
       +                        push @substrings_new, [0, $str, $str];
       +                        push @substrings_new, [1, $1, $1];
       +                } else {
       +                        push @substrings_new, [0, $str, $str];
       +                }
       +        }
       +        @$substrings = @substrings_new;
       +=pod
       +        # FIXME: this is *probably* not needed anymore
       +        my @substrings_new;
                foreach my $cur_substr (@$substrings) {
                        if ($cur_substr->[0] == 1) {
                                push(@substrings_new, $cur_substr);
       t@@ -988,6 +1016,7 @@ sub split_words {
                        }
                }
                @$substrings = @substrings_new;
       +=cut
        }
        
        # small helper function to add an untransliterated string to the last substring