URI: 
       tAdd diacritic handling; documentation not updated yet - transliterate - Transliteration engine
  HTML git clone git://lumidify.org/transliterate.git
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
   DIR commit ec431a30af78b0cc2f936c8064cb57acecff5414
   DIR parent 1679620c311e9d20a76f682e3207e9ba65d03b36
  HTML Author: lumidify <nobody@lumidify.org>
       Date:   Tue, 31 Mar 2020 15:48:27 +0200
       
       Add diacritic handling; documentation not updated yet
       
       Diffstat:
         A tests/test5/config                  |      20 ++++++++++++++++++++
         A tests/test5/descr.txt               |       1 +
         A tests/test5/err.txt                 |       3 +++
         A tests/test5/expected.txt            |       3 +++
         A tests/test5/input.txt               |       3 +++
         M transliterate.pl                    |      94 ++++++++++++++++++++++++++-----
       
       6 files changed, 109 insertions(+), 15 deletions(-)
       ---
   DIR diff --git a/tests/test5/config b/tests/test5/config
       t@@ -0,0 +1,20 @@
       +split "[ \n]+"
       +beforeword " "
       +afterword "[ \n]"
       +tablesep ","
       +choicesep "|"
       +
       +ignore "../data/ignore.txt"
       +table words "../data/words1.txt"
       +table endings "../data/endings_choices.txt"
       +
       +expand words endings
       +
       +match "w ord" "word" beginword nofinal
       +matchignore "-d" endword
       +
       +group beginword endword
       +replace words
       +endgroup
       +
       +diacritics "̈"
   DIR diff --git a/tests/test5/descr.txt b/tests/test5/descr.txt
       t@@ -0,0 +1 @@
       +matchignore only endword; expand noroot
   DIR diff --git a/tests/test5/err.txt b/tests/test5/err.txt
       t@@ -0,0 +1,3 @@
       +Unknown word: "word0"
       +Word "word1_replacedend1r1|word1_replacedend1r2" with 2 word choices.
       +Unknown word: "-dword9end2"
   DIR diff --git a/tests/test5/expected.txt b/tests/test5/expected.txt
       t@@ -0,0 +1,3 @@
       +ignore
       +word0   word1_replacedend1r1|word1_replacedend1r2
       +-dword9end2 word9_replacedend2r-d
   DIR diff --git a/tests/test5/input.txt b/tests/test5/input.txt
       t@@ -0,0 +1,3 @@
       +ignore
       +wörd0   word1end1
       +-dword9end2 word9end2-d
   DIR diff --git a/transliterate.pl b/transliterate.pl
       t@@ -79,7 +79,7 @@ sub add_to_trie {
        # in the form ["<action name>", <optional args>].
        # See `handle_unknown_word_action` for currently accepted values
        sub prompt_unknown_word {
       -        my ($contextl, $contextl_orig, $word, $contextr, $contextr_orig, $table_paths, $cur_lineno, $config_error) = @_;
       +        my ($contextl, $contextl_orig, $word, $contextr, $contextr_orig, $config, $cur_lineno, $config_error) = @_;
                my $action;
                my $stop = 0;
        
       t@@ -162,12 +162,43 @@ sub prompt_unknown_word {
                $vbox->pack_start($hbox, FALSE, FALSE, 10);
                $hbox->show;
        
       +        if (exists $config->{"diacritics"}) {
       +                $hbox = Gtk2::HBox->new(FALSE, 0);
       +                my $no_diacritic_label;
       +                my $stripped;
       +                my $accept_button = Gtk2::Button->new("Accept?");
       +                $accept_button->signal_connect(
       +                        clicked => sub {
       +                                $action = ["replace", @$stripped];
       +                                $window->destroy;
       +                        }, $window);
       +                $button = Gtk2::Button->new("Retry without diacritics");
       +                $button->signal_connect(
       +                        clicked => sub {
       +                                $stripped = replace_strip_diacritics($config, $word);
       +                                my $tmp = "";
       +                                foreach (@$stripped) {
       +                                        $tmp .= $_->[1];
       +                                }
       +                                $no_diacritic_label->set_text($tmp);
       +                                $accept_button->show;
       +                        }, $window);
       +                $hbox->pack_start($button, FALSE, FALSE, 0);
       +                $button->show;
       +                $hbox->pack_start($accept_button, FALSE, FALSE, 0);
       +                $no_diacritic_label = Gtk2::Label->new("");
       +                $hbox->pack_start($no_diacritic_label, FALSE, FALSE, 10);
       +                $no_diacritic_label->show;
       +                $vbox->pack_start($hbox, FALSE, FALSE, 0);
       +                $hbox->show;
       +        }
       +
                $hbox = Gtk2::HBox->new(FALSE, 0);
                $label = Gtk2::Label->new("Add to list: ");
                $hbox->pack_start($label, FALSE, FALSE, 0);
                $label->show;
                my $path_list = Gtk2::ComboBox->new_text;
       -        foreach my $path (sort keys %$table_paths) {
       +        foreach my $path (sort keys %{$config->{"display_tables"}}) {
                        $path_list->append_text($path);
                }
                $hbox->pack_start($path_list, FALSE, FALSE, 0);
       t@@ -667,7 +698,8 @@ sub interpret_config {
                        "tablesep" => [$STRING],
                        "choicesep" => [$STRING],
                        "group" => [],
       -                "endgroup" => []
       +                "endgroup" => [],
       +                "diacritics" => [$STRING]
                );
                my $in_group = 0;
                foreach my $cmd (@$config_list) {
       t@@ -787,6 +819,13 @@ sub interpret_config {
                                        # loaded a table anyways
                                        my $trie_root = $config{"replacements"}->[-1]->{"words"};
                                        add_to_trie($table, $trie_root, $tables{$table}, $args, \%config);
       +                        } elsif ($cmd->[0]->{"value"} eq "diacritics") {
       +                                if (!exists $config{"diacritics"}) {
       +                                        $config{"diacritics"} = [];
       +                                }
       +                                foreach (1..$#$cmd) {
       +                                        push @{$config{"diacritics"}}, $cmd->[$_]->{"value"};
       +                                }
                                } elsif ($cmd->[0]->{"value"} eq "split") {
                                        $config{"split"} = $cmd->[1]->{"value"};
                                } elsif ($cmd->[0]->{"value"} eq "beforeword") {
       t@@ -857,7 +896,7 @@ sub load_config {
        # 1 - the current line needs to be re-transliterated with the new config
        # 2 - an error occurred while reloading the config
        sub handle_unknown_word_action {
       -        my ($action, $config, $args) = @_;
       +        my ($substrings, $index, $action, $config, $args) = @_;
                if ($action->[0] eq "ignore") {
                        $config->{"ignore_words"}->{$action->[2]} = "";
                        if ($action->[1] eq "permanent") {
       t@@ -915,7 +954,10 @@ sub handle_unknown_word_action {
                        } else {
                                return 2;
                        }
       +        } elsif ($action->[0] eq "replace") {
       +                splice @$substrings, $index, 1, @{$action}[1..$#$action];
                }
       +        return 0;
        }
        
        # Split $substrings into single words based on the "split" option
       t@@ -1131,6 +1173,14 @@ sub replace_group {
                @$substrings = @substrings_new;
        }
        
       +sub replace_strip_diacritics {
       +        my ($config, $word) = @_;
       +        foreach my $diacritic (@{$config->{"diacritics"}}) {
       +                $word =~ s/$diacritic//g;
       +        }
       +        return replace_line($config, $word);
       +}
       +
        # Perform all replacements on $line based on $config
        # $substrings: array of arrays - each one has three elements:
        # first 0 or 1, indicating if the substring following it has already
       t@@ -1166,11 +1216,14 @@ sub replace_line {
        # $args - the command line args
        # $cur_lineno - display string to show the user the current line number
        # Returns:
       -# 1 - the current line must be replaced again with the new config
       -# 0 - everything's fine
       +# -1 - all done
       +# anything else - the substrings must be replaced again starting at the returned index
        sub get_unknown_words {
                my ($substrings, $config, $args, $cur_lineno) = @_;
       -        foreach my $i (0..$#$substrings) {
       +        my $i = 0;
       +        # this is done so $substrings can be modified during the loop
       +        # (instead of just foreach)
       +        while ($i <= $#$substrings) {
                        my $word = $substrings->[$i];
                        if (!$word->[0] && !exists($config->{"ignore_words"}->{$word->[1]})) {
                                my $contextl = "";
       t@@ -1187,26 +1240,27 @@ sub get_unknown_words {
                                }
                                my $action = prompt_unknown_word($contextl, $contextl_orig,
                                        $word->[1], $contextr, $contextr_orig,
       -                                $config->{"display_tables"}, "$cur_lineno"
       +                                $config, "$cur_lineno"
                                );
                                # if $ret == 2, config could not be loaded
                                # if $ret == 1, line must be redone with new config
       -                        my $ret = handle_unknown_word_action($action, $config, $args);
       +                        my $ret = handle_unknown_word_action($substrings, $i, $action, $config, $args);
                                # keep retrying until the user chooses an action which
                                # didn't throw an error
                                while ($ret == 2) {
                                        $action = prompt_unknown_word($contextl, $contextl_orig,
                                                $word->[1], $contextr, $contextr_orig,
       -                                        $config->{"display_tables"}, "$cur_lineno", 1);
       -                                $ret = handle_unknown_word_action($action, $config, $args);
       +                                        $config, "$cur_lineno", 1);
       +                                $ret = handle_unknown_word_action($substrings, $i, $action, $config, $args);
                                }
                                # re-transliterate the line with the new config
                                if ($ret == 1) {
       -                                return 1;
       +                                return $i;
                                }
                        }
       +                $i++;
                }
       -        return 0;
       +        return -1;
        }
        
        # Main replacement function
       t@@ -1228,8 +1282,18 @@ sub replace {
        
                        if (!$args->{"nounknowns"}) {
                                # re-transliterate the string if the config was reloaded
       -                        while (get_unknown_words($substrings, $config, $args, "$./$total_lines")) {
       -                                $substrings = replace_line($config, $nfd_line);
       +                        my $start_index = 0;
       +                        while ($start_index != -1) {
       +                                $start_index = get_unknown_words($substrings, $config, $args, "$./$total_lines", $start_index);
       +                                if ($start_index != -1) {
       +                                        my $str = "";
       +                                        foreach ($start_index..$#$substrings) {
       +                                                $str .= $substrings->[$_]->[2];
       +                                        }
       +                                        my $new_substrings = replace_line($config, $str);
       +                                        splice @$substrings, $start_index;
       +                                        push @$substrings, @$new_substrings;
       +                                }
                                }
                        } elsif ($args->{"debug"}) {
                                foreach my $s (@$substrings) {