URI: 
       tAdd debugging option for match and group statements - transliterate - Transliteration engine
  HTML git clone git://lumidify.org/transliterate.git
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
   DIR commit b174c1557b4acae8b3f30c0469d7b8f26c27bb8b
   DIR parent c78c54b0fbc4a84c11673206c0a1fb6526cd6136
  HTML Author: lumidify <nobody@lumidify.org>
       Date:   Fri, 17 Apr 2020 09:50:23 +0200
       
       Add debugging option for match and group statements
       
       Diffstat:
         M tests/runtest.sh                    |       2 +-
         M transliterate.pl                    |     116 +++++++++++++++++++++++--------
       
       2 files changed, 87 insertions(+), 31 deletions(-)
       ---
   DIR diff --git a/tests/runtest.sh b/tests/runtest.sh
       t@@ -1,6 +1,6 @@
        #!/bin/sh
        
       -../transliterate.pl --force --config "$1/config" --output runtest.txt --nounknowns --nochoices --debug "$1/input.txt" > runtest_err.txt 2>&1
       +../transliterate.pl --force --config "$1/config" --output runtest.txt --nounknowns --nochoices --debugspecial "$1/input.txt" > runtest_err.txt 2>&1
        diff "$1/expected.txt" runtest.txt > /dev/null
        err1=$?
        diff "$1/err.txt" runtest_err.txt > /dev/null
   DIR diff --git a/transliterate.pl b/transliterate.pl
       t@@ -89,7 +89,10 @@ sub add_to_trie {
        # in the form ["<action name>", <optional args>].
        # See `handle_unknown_word_action` for currently accepted values
        sub prompt_unknown_word {
       -        my ($contextl, $contextl_orig, $word_repl, $word, $contextr, $contextr_orig, $config, $cur_lineno, $config_error) = @_;
       +        # yes, this function really should take fewer arguments...
       +        # it would be better to just pass the substrings and an index
       +        my ($contextl, $contextl_orig, $word_repl, $word, $contextr, $contextr_orig,
       +            $config, $cur_lineno, $args, $config_error) = @_;
                my $action;
                my $stop = 0;
        
       t@@ -208,7 +211,7 @@ sub prompt_unknown_word {
                                $button->signal_connect(
                                        clicked => sub {
                                                my @chars = @{$without}[1..$#$without];
       -                                        my $stripped = replace_strip_chars($config, \@chars, $word);
       +                                        my $stripped = replace_strip_chars($config, $args, \@chars, $word);
                                                # recombine substrings
                                                my $repl_text = "";
                                                $repl_text .= $_->[1] foreach @$stripped;
       t@@ -827,9 +830,12 @@ sub interpret_config {
                                                warn "ERROR: New group started without ending last one in config\n";
                                                return;
                                        }
       -                                push @{$config{"replacements"}}, {"type" => "group", "words" => {}};
       +                                push @{$config{"replacements"}}, {
       +                                        "type" => "group", "tables" => [],
       +                                        "words" => {}, "options" => {}};
       +                                # add all options such as "endword" to the options hash
                                        for (1..$#$cmd) {
       -                                        $config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1;
       +                                        $config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1;
                                        }
                                        $in_group = 1;
                                } elsif ($cmd_name eq "endgroup") {
       t@@ -845,20 +851,24 @@ sub interpret_config {
                                        }
                                        push @{$config{"replacements"}}, {
                                                "type" => "match",
       +                                        "options" => {},
                                                "search" => NFD($cmd->[1]->{"value"}),
                                                "replace" => $cmd->[2]->{"value"}};
                                        for (3..$#$cmd) {
       -                                        # add optional arguments as keys in replacement config
       -                                        $config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1;
       +                                        # add optional arguments as keys in options hash
       +                                        $config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1;
                                        }
                                } elsif ($cmd_name eq "matchignore") {
                                        if ($in_group) {
                                                warn "ERROR: matchignore command is invalid inside group\n";
                                                return;
                                        }
       -                                push @{$config{"replacements"}}, {"type" => "match", "search" => NFD($cmd->[1]->{"value"})};
       +                                push @{$config{"replacements"}}, {
       +                                        "type" => "match",
       +                                        "options" => {},
       +                                        "search" => NFD($cmd->[1]->{"value"})};
                                        for (2..$#$cmd) {
       -                                        $config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1;
       +                                        $config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1;
                                        }
                                } elsif ($cmd_name eq "replace") {
                                        if (!$in_group) {
       t@@ -879,6 +889,9 @@ sub interpret_config {
                                        $config{"table_paths"}->{$table_path} = [] if !exists $config{"table_paths"}->{$table_path};
                                        push @{$config{"table_paths"}->{$table_path}}, [$replacement_id, $table];
        
       +                                # store list of tables for --debug
       +                                push @{$config{"replacements"}->[-1]->{"tables"}}, $table;
       +
                                        # Note: we don't need to check if $table{"choicesep"} was defined
                                        # here since we can't ever get this far without first having
                                        # loaded a table anyways
       t@@ -1080,9 +1093,9 @@ sub push_unknown {
        #        specifies if the match is only valid when $config->{"beforeword"}
        #        or $config->{"afterword"} occur before or after it, respectively
        sub replace_match {
       -        my ($config, $replace_config, $substrings) = @_;
       -        my $beginword = exists $replace_config->{"beginword"};
       -        my $endword = exists $replace_config->{"endword"};
       +        my ($config, $args, $replace_config, $substrings, $debug_msg) = @_;
       +        my $beginword = exists $replace_config->{"options"}->{"beginword"};
       +        my $endword = exists $replace_config->{"options"}->{"endword"};
                my $fullword = $beginword && $endword;
                my $beforeword = $config->{"beforeword"};
                my $afterword = $config->{"afterword"};
       t@@ -1118,6 +1131,9 @@ sub replace_match {
                        my $i1 = 0;
                        while ($substrings->[$i]->[1] =~ m/$word/g) {
                                if (!$found_word) {
       +                                if ($args->{"debug"}) {
       +                                        print $debug_msg;
       +                                }
                                        $found_word = 1;
                                        if ($i != 0) {
                                                push(@substrings_new, @{$substrings}[0..$i-1]);
       t@@ -1146,9 +1162,15 @@ sub replace_match {
                                }
                                my $orig_str = substr($substrings->[$i]->[1], $i0, $i1-$i0);
                                my $replace_str = $replace_word // $orig_str;
       -                        if ($replace_config->{"nofinal"}) {
       +                        if ($replace_config->{"options"}->{"nofinal"}) {
       +                                if ($args->{"debug"}) {
       +                                        warn "Replaced (nofinal) \"$orig_str\" with \"$replace_str\"\n";
       +                                }
                                        push_unknown \@substrings_new, $orig_str, $replace_str;
                                } else {
       +                                if ($args->{"debug"}) {
       +                                        warn "Replaced \"$orig_str\" with \"$replace_str\"\n";
       +                                }
                                        push(@substrings_new, [1, $replace_str, $orig_str]);
                                }
                                $last_idx = $i1;
       t@@ -1168,8 +1190,9 @@ sub replace_match {
        # $replace_config->{"beginword"}, $replace_config->{"endword"} -
        #        same as in `replace_match`
        sub replace_group {
       -        my ($config, $replace_config, $substrings) = @_;
       +        my ($config, $args, $replace_config, $substrings, $debug_msg) = @_;
                my @substrings_new;
       +        my $anything_replaced = 0;
                # Recurse backwords towards the root node of the trie to find the first
                # node with a key "final" which satisfies the ending condition (if "endword" is set)
                # Returns the id *after* the last match and the node that was found
       t@@ -1178,7 +1201,7 @@ sub replace_group {
                        my ($i, $tmp_cur_node, $s) = @_;
                        do {
                                my $after_ch = substr($s->[1], $i, 1);
       -                        if (exists $tmp_cur_node->{"final"} && (!exists($replace_config->{"endword"}) ||
       +                        if (exists $tmp_cur_node->{"final"} && (!exists($replace_config->{"options"}->{"endword"}) ||
                                        $after_ch eq "" || $after_ch =~ $config->{"afterword"})) {
                                        return ($i, $tmp_cur_node);
                                }
       t@@ -1207,7 +1230,7 @@ sub replace_group {
                                if (exists $cur_node->{$ch}) {
                                        if ($cur_node == $replace_config->{"words"}) {
                                                my $before_ch = $i > 0 ? substr($s->[1], $i - 1, 1) : "";
       -                                        if (exists($replace_config->{"beginword"}) &&
       +                                        if (exists($replace_config->{"options"}->{"beginword"}) &&
                                                        $before_ch ne "" && $before_ch !~ $config->{"beforeword"}) {
                                                        push_unknown \@substrings_new, $ch;
                                                        $i++;
       t@@ -1223,7 +1246,16 @@ sub replace_group {
                                                push_unknown \@substrings_new, substr($s->[1], $i + 1, 1);
                                                $i += 2;
                                        } else {
       -                                        push(@substrings_new, [1, $tmp_cur_node->{"final"}, substr($s->[1], $start_i, $i-$start_i)]);
       +                                        my $orig = substr($s->[1], $start_i, $i-$start_i);
       +                                        my $final = $tmp_cur_node->{"final"};
       +                                        if ($args->{"debug"}) {
       +                                                if (!$anything_replaced) {
       +                                                        warn $debug_msg;
       +                                                        $anything_replaced = 1;
       +                                                }
       +                                                warn "Replaced \"$orig\" with \"$final\"\n";
       +                                        }
       +                                        push(@substrings_new, [1, $final, $orig]);
                                        }
                                        $cur_node = $replace_config->{"words"};
                                        next;
       t@@ -1239,11 +1271,11 @@ sub replace_group {
        # Perform all replacements on $word, first removing all
        # characters specified in $chars
        sub replace_strip_chars {
       -        my ($config, $chars, $word) = @_;
       +        my ($config, $args, $chars, $word) = @_;
                foreach my $char (@$chars) {
                        $word =~ s/\Q$char\E//g;
                }
       -        return replace_line($config, $word);
       +        return replace_line($config, $args, $word);
        }
        
        # Perform all replacements on $line based on $config
       t@@ -1253,13 +1285,29 @@ sub replace_strip_chars {
        # transliterated string, and lastly the original string.
        # If the first element is 0, the second two elements are obviously same
        sub replace_line {
       -        my ($config, $line) = @_;
       +        my ($config, $args, $line) = @_;
                my $substrings = [[0, $line, $line]];
                foreach my $replacement (@{$config->{"replacements"}}) {
                        if ($replacement->{"type"} eq "match") {
       -                        replace_match($config, $replacement, $substrings);
       +                        my $debug_msg;
       +                        if ($args->{"debug"}) {
       +                                my $options = join " ", keys(%{$replacement->{"options"}});
       +                                $debug_msg =  "Match ($options): \"$replacement->{search}\"";
       +                                if ($replacement->{"replace"}) {
       +                                        $debug_msg .= " \"$replacement->{replace}\"\n";
       +                                } else {
       +                                        $debug_msg .= " (ignore)\n";
       +                                }
       +                        }
       +                        replace_match($config, $args, $replacement, $substrings, $debug_msg);
                        } elsif ($replacement->{"type"} eq "group") {
       -                        replace_group($config, $replacement, $substrings);
       +                        my $debug_msg;
       +                        if ($args->{"debug"}) {
       +                                my $options = join " ", keys(%{$replacement->{"options"}});
       +                                my $tables = '"' . join('" "', @{$replacement->{"tables"}}) . '"';
       +                                $debug_msg = "Group ($options): $tables\n";
       +                        }
       +                        replace_group($config, $args, $replacement, $substrings, $debug_msg);
                        }
                }
                # splits all words at the end so that the splitting characters
       t@@ -1294,8 +1342,7 @@ sub call_unknown_word_window {
                }
                my $action = prompt_unknown_word($contextl, $contextl_orig,
                        $word->[1], $word->[2], $contextr, $contextr_orig,
       -                $config, "$cur_lineno"
       -        );
       +                $config, "$cur_lineno", $args);
                # if $ret == 3, rest of line should be skipped
                # if $ret == 2, config could not be loaded
                # if $ret == 1, line must be redone with new config
       t@@ -1305,7 +1352,7 @@ sub call_unknown_word_window {
                while ($ret == 2) {
                        $action = prompt_unknown_word($contextl, $contextl_orig,
                                $word->[1], $word->[2], $contextr, $contextr_orig,
       -                        $config, "$cur_lineno", 1);
       +                        $config, "$cur_lineno", $args, 1);
                        $ret = handle_unknown_word_action($action, $config, $args);
                }
                return $ret;
       t@@ -1359,14 +1406,14 @@ sub replace {
                                $comment = $1;
                        }
                        my $nfd_line = NFD($line);
       -                my $substrings = replace_line($config, $nfd_line);
       +                my $substrings = replace_line($config, $args, $nfd_line);
        
                        if (!$args->{"nounknowns"}) {
                                # re-transliterate the string if the config was reloaded
                                while (get_unknown_words($substrings, $config, $args, "$./$total_lines")) {
       -                                $substrings = replace_line($config, $nfd_line);
       +                                $substrings = replace_line($config, $args, $nfd_line);
                                }
       -                } elsif ($args->{"debug"}) {
       +                } elsif ($args->{"debugspecial"}) {
                                foreach my $s (@$substrings) {
                                        if (!$s->[0] && !exists($config->{"ignore_words"}->{$s->[1]})) {
                                                warn "Unknown word: \"$s->[1]\"\n";
       t@@ -1378,10 +1425,10 @@ sub replace {
                                # "Open in unknown word window"
                                while (my $ret = prompt_choose_word($substrings, $config, $args, "$./$total_lines")) {
                                        if ($ret == 1) {
       -                                        $substrings = replace_line($config, $nfd_line);
       +                                        $substrings = replace_line($config, $args, $nfd_line);
                                        }
                                }
       -                } elsif ($args->{"debug"}) {
       +                } elsif ($args->{"debugspecial"}) {
                                foreach my $s (@$substrings) {
                                        if ($s->[0] && $s->[1] =~ /\Q$config->{choicesep}\E/) {
                                                my $num_choices = split /\Q$config->{choicesep}\E/, $s->[1];
       t@@ -1399,7 +1446,7 @@ sub replace {
        
        my %args = ("config" => "config", "start" => 1, "errors" => "", "output" => "");
        GetOptions(
       -        \%args, "debug",
       +        \%args, "debug", "debugspecial",
                "nochoices", "nounknowns",
                "force", "start=i",
                "output=s", "config=s",
       t@@ -1542,6 +1589,15 @@ prompts.
        
        =item B<--debug>
        
       +Prints information helpful for debugging problems with the B<match> and B<group>
       +statements.
       +
       +For each B<match> or B<group> statement which replaces anything, the original
       +statement is printed (the format is a bit different than in the config) and
       +each actual word that's replaced is printed.
       +
       +=item B<--debugspecial>
       +
        This option is only useful for automatic testing of the transliteration engine.
        
        If B<--nochoices> is enabled, each word in the input with multiple choices will