URI: 
       transliterate.pl - transliterate - Transliteration engine
  HTML git clone git://lumidify.org/transliterate.git (fast, but not encrypted)
  HTML git clone https://lumidify.org/transliterate.git (encrypted, but very slow)
  HTML git clone git://4kcetb7mo7hj6grozzybxtotsub5bempzo4lirzc3437amof2c2impyd.onion/transliterate.git (over tor)
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
       transliterate.pl (83637B)
       ---
            1 #!/usr/bin/env perl
            2 
            3 # Proudly written using vi (OpenBSD nvi)
            4 
            5 # NOTE: If you're wondering why the error codes used by the functions are so
            6 # inconsistent, go ask my former self
            7 
            8 # NOTE 2: This codebase has grown as new features were needed, so it's quite
            9 # ugly now, but I haven't had time to clean it up.
           10 
           11 use strict;
           12 use warnings;
           13 use utf8;
           14 use feature 'unicode_strings';
           15 use open qw< :encoding(UTF-8) >;
           16 binmode STDIN, ":encoding(UTF-8)";
           17 binmode STDOUT, ":encoding(UTF-8)";
           18 binmode STDERR, ":encoding(UTF-8)";
           19 use Unicode::Normalize;
           20 use Glib qw/TRUE FALSE/;
           21 use Gtk3 '-init';
           22 use Getopt::Long;
           23 use Pod::Usage;
           24 use Scalar::Util qw(weaken);
           25 use File::Basename qw(dirname);
           26 use File::Spec::Functions qw(rel2abs file_name_is_absolute);
           27 
           28 # takes a string of words separated by '$config->{choicesep}' and returns a new string in the
           29 # same format with duplicates removed
           30 sub get_unique_words {
           31         my ($word, $config) = @_;
           32         my %tmp;
           33         my @words_uniq = grep !$tmp{$_}++, split /\Q$config->{choicesep}\E/, $word;
           34         return join $config->{choicesep}, @words_uniq;
           35 }
           36 
           37 # Adds all words in $words to $trie
           38 # Automatically combines duplicate words with "$config->{choicesep}" inbetween
           39 sub add_to_trie {
           40         my ($table_name, $trie, $words, $args, $config, $override) = @_;
           41         foreach my $word (keys %$words) {
           42                 my $cur_node = $trie;
           43                 foreach my $char (split //, $word) {
           44                         if (!exists($cur_node->{$char})) {
           45                                 $cur_node->{$char}->{"parent"} = $cur_node;
           46                                 # This is required to avoid circular references
           47                                 # (otherwise, the garbage collector doesn't ever
           48                                 # destroy these nodes, leading to the memory
           49                                 # consumption growing without restraint if
           50                                 # "Reload config" is used)
           51                                 weaken($cur_node->{$char}->{"parent"});
           52                         }
           53                         $cur_node = $cur_node->{$char};
           54                 }
           55                 if (exists($cur_node->{"final"})) {
           56                         if ($override) {
           57                                 $cur_node->{"final"} = $words->{$word};
           58                                 next;
           59                         }
           60                         if ($args->{"checkduplicates"}) {
           61                                 warn "WARNING: Duplicate word \"$word\". Last occurrence as " .
           62                                         "\"$cur_node->{final}\" in table \"$cur_node->{table_name}\", " .
           63                                         "current occurrence as \"$words->{$word}\" in " .
           64                                         "table \"$table_name\.\n";
           65                         }
           66                         $cur_node->{"final"} = get_unique_words($cur_node->{"final"} . $config->{choicesep} . $words->{$word}, $config);
           67                 } else {
           68                         $cur_node->{"final"} = $words->{$word};
           69                         if ($args->{"checkduplicates"}) {
           70                                 $cur_node->{"table_name"} = $table_name;
           71                         }
           72                 }
           73         }
           74 }
           75 
           76 # Prompt user when no replacement has been found for a word
           77 # $word is the word that was not found and $context* the context,
           78 # $word_repl is the replacement word - this is only used when the window is
           79 #            called from the word choice window, since the original and
           80 #            replacement aren't the same then
           81 # with context*_orig being the original, non-transliterated context.
           82 # $table_paths is a mapping of table paths (here, only the keys, i.e.
           83 # the actual paths, are used) to allow the user to choose a table to
           84 # save a new replacement to.
           85 # $cur_lineno is a display string to show the current line number
           86 # $config_error is an optional flag to specify whether an error
           87 # message should be displayed, informing the user that the config
           88 # could not be loaded (used when "Reload config" is clicked)
           89 # Returns: an array reference containing an action to be taken,
           90 # in the form ["<action name>", <optional args>].
           91 # See `handle_unknown_word_action` for currently accepted values
           92 sub prompt_unknown_word {
           93         # yes, this function really should take fewer arguments...
           94         # it would be better to just pass the substrings and an index
           95         my ($contextl, $contextl_orig, $word_repl, $word, $contextr, $contextr_orig,
           96             $config, $cur_lineno, $args, $config_error) = @_;
           97         my $action;
           98         my $stop = 0;
           99 
          100         my $window = Gtk3::Window->new('toplevel');
          101         $window->signal_connect(delete_event => sub {return FALSE});
          102         $window->signal_connect(destroy => sub { Gtk3->main_quit; });
          103         $window->set_border_width(10);
          104 
          105         my $vbox = Gtk3::VBox->new(FALSE, 10);
          106 
          107         my $linelabel = Gtk3::Label->new("Current line: $cur_lineno");
          108         $vbox->pack_start($linelabel, FALSE, FALSE, 0);
          109         $linelabel->show;
          110 
          111         my $wordlabel = Gtk3::Label->new("Word not found: $word");
          112         $wordlabel->set_alignment(0.0, 0.0);
          113         $vbox->pack_start($wordlabel, FALSE, FALSE, 0);
          114         $wordlabel->show;
          115 
          116         # Make a text box with the given left and right context and label
          117         # Also creates a button allowing the user to set the currently
          118         # selected text as the word to be replaced - useful when only part
          119         # of the entire word that was not found has to be replaced
          120         my $make_context_box = sub {
          121                 # look, don't blame me for these miserably named variables...
          122                 my ($ctxtl, $wrd, $ctxtr, $lbl, $btn_lbl) = @_;
          123                 my $hbox = Gtk3::HBox->new(FALSE, 5);
          124                 my $label = Gtk3::Label->new_with_mnemonic($lbl);
          125                 my $text = Gtk3::TextView->new;
          126                 $label->set_mnemonic_widget($text);
          127                 $text->set_wrap_mode("word");
          128                 my $buffer = $text->get_buffer();
          129                 $buffer->set_text($ctxtr);
          130                 my $highlight = $buffer->create_tag("yellow_bg", "background", "yellow");
          131                 my $start_iter = $buffer->get_start_iter();
          132                 $buffer->insert_with_tags($start_iter, $wrd, $highlight);
          133                 $start_iter = $buffer->get_start_iter();
          134                 $buffer->insert($start_iter, $ctxtl);
          135                 # set cursor position to beginning of word
          136                 my $cur_iter = $buffer->get_iter_at_offset(length($ctxtl));
          137                 $buffer->place_cursor($cur_iter);
          138                 my $button = Gtk3::Button->new($btn_lbl);
          139                 $button->signal_connect(
          140                         clicked => sub {
          141                                 if (my ($start, $end) = $buffer->get_selection_bounds()) {
          142                                         $word = $buffer->get_text($start, $end, FALSE);
          143                                         $wordlabel->set_text("Selected: $word");
          144                                 }
          145                         }, $window);
          146                 $hbox->pack_start($label, FALSE, FALSE, 0);
          147                 $hbox->pack_start($text, TRUE, TRUE, 0);
          148                 $vbox->pack_start($hbox, FALSE, FALSE, 0);
          149                 $hbox = Gtk3::HBox->new(FALSE, 5);
          150                 $hbox->pack_start($button, FALSE, FALSE, 0);
          151                 my $complete_text = $ctxtl . $wrd . $ctxtr;
          152                 $button = Gtk3::Button->new("Reset text");
          153                 $button->signal_connect(
          154                         clicked => sub {
          155                                 $buffer->set_text($complete_text);
          156                         }, $window);
          157                 $hbox->pack_start($button, FALSE, FALSE, 0);
          158                 $vbox->pack_start($hbox, FALSE, FALSE, 0);
          159         };
          160         $make_context_box->(
          161             $contextl_orig, $word, $contextr_orig,
          162             "Ori_ginal context: ",
          163             "Use _original selection as word"
          164         );
          165         $make_context_box->(
          166             $contextl, $word_repl, $contextr,
          167             "Transliterated _context: ",
          168             "Use _transliterated selection as word"
          169         );
          170 
          171         my $hbox = Gtk3::HBox->new(FALSE, 5);
          172         my $label = Gtk3::Label->new("Ignore: ");
          173         $hbox->pack_start($label, FALSE, FALSE, 0);
          174         my $button = Gtk3::Button->new("Th_is run");
          175         $button->signal_connect(
          176                 clicked => sub {
          177                         $action = ["ignore", "run", $word];
          178                         $window->destroy;
          179                 }, $window);
          180         $hbox->pack_start($button, FALSE, FALSE, 0);
          181         $button = Gtk3::Button->new("_Permanently");
          182         $button->signal_connect(
          183                 clicked => sub {
          184                         $action = ["ignore", "permanent", $word];
          185                         $window->destroy;
          186                 }, $window);
          187         $hbox->pack_start($button, FALSE, FALSE, 0);
          188         $button = Gtk3::Button->new("_Whole line");
          189         $button->signal_connect(
          190                 clicked => sub {
          191                         $action = ["ignore", "wholeline"];
          192                         $window->destroy;
          193                 }, $window);
          194         $hbox->pack_start($button, FALSE, FALSE, 0);
          195         $vbox->pack_start($hbox, FALSE, FALSE, 0);
          196 
          197         # AHHHH! IT BURNS!!! THE CODE IS SO HORRIBLE!
          198         # Take note, kids - this is what happens when you keep adding
          199         # features without rethinking your basic design.
          200 
          201         $hbox = Gtk3::HBox->new(FALSE, 5);
          202         $label = Gtk3::Label->new_with_mnemonic("Add to _list: ");
          203         $hbox->pack_start($label, FALSE, FALSE, 0);
          204         my $path_list = Gtk3::ComboBoxText->new;
          205         $label->set_mnemonic_widget($path_list);
          206         foreach my $path (sort keys %{$config->{"display_tables"}}) {
          207                 $path_list->append_text($path);
          208         }
          209         $hbox->pack_start($path_list, FALSE, FALSE, 0);
          210         $vbox->pack_start($hbox, FALSE, FALSE, 0);
          211 
          212         $hbox = Gtk3::HBox->new(FALSE, 5);
          213         $label = Gtk3::Label->new_with_mnemonic("_Replacement: ");
          214         $hbox->pack_start($label, FALSE, FALSE, 0);
          215         my $replace_entry = Gtk3::Entry->new;
          216         $label->set_mnemonic_widget($replace_entry);
          217         $hbox->pack_start($replace_entry, TRUE, TRUE, 0);
          218         $vbox->pack_start($hbox, FALSE, FALSE, 0);
          219 
          220         if (exists $config->{"retrywithout"}) {
          221                 $hbox = Gtk3::HBox->new(FALSE, 5);
          222                 # Pressing Alt+e just activates the first of the retrywithout buttons
          223                 $label = Gtk3::Label->new_with_mnemonic("Retry without: ");
          224                 $hbox->pack_start($label, FALSE, FALSE, 0);
          225                 foreach my $without (@{$config->{"retrywithout"}}) {
          226                         $button = Gtk3::Button->new("$without->[0]");
          227                         $button->signal_connect(
          228                                 clicked => sub {
          229                                         my @chars = @{$without}[1..$#$without];
          230                                         my $stripped = replace_strip_chars($config, $args, \@chars, $word);
          231                                         # recombine substrings
          232                                         my $repl_text = "";
          233                                         $repl_text .= $_->[1] foreach @$stripped;
          234                                         $replace_entry->set_text($repl_text);
          235                                 }, $window);
          236                         $hbox->pack_start($button, FALSE, FALSE, 0);
          237                 }
          238                 $vbox->pack_start($hbox, FALSE, FALSE, 0);
          239         }
          240 
          241         $hbox = Gtk3::HBox->new(FALSE, 0);
          242         $button = Gtk3::Button->new("_Add replacement");
          243         $button->signal_connect(
          244                 clicked => sub {
          245                         if ($path_list->get_active != -1) {
          246                                 $action = ["add", $word, $replace_entry->get_text, $path_list->get_active_text];
          247                                 $window->destroy;
          248                         }
          249                 }, $window);
          250         $hbox->pack_start($button, FALSE, FALSE, 0);
          251         $vbox->pack_start($hbox, FALSE, FALSE, 0);
          252 
          253         $hbox = Gtk3::HBox->new(FALSE, 5);
          254         $button = Gtk3::Button->new("_Stop processing");
          255         $button->signal_connect(
          256                 clicked => sub {
          257                         $stop = 1;
          258                         $window->destroy;
          259                 }, $window);
          260         $hbox->pack_start($button, FALSE, FALSE, 0);
          261 
          262         $button = Gtk3::Button->new("Reload config");
          263         $button->signal_connect(
          264                 clicked => sub {
          265                         $action = ["reload"];
          266                         $window->destroy;
          267                 }, $window);
          268         $hbox->pack_start($button, FALSE, FALSE, 0);
          269 
          270         if ($config_error) {
          271                 $label = Gtk3::Label->new("Error loading config; see terminal output for details");
          272                 $hbox->pack_start($label, FALSE, FALSE, 0);
          273         }
          274         $vbox->pack_start($hbox, FALSE, FALSE, 0);
          275 
          276         $window->add($vbox);
          277         $window->show_all;
          278         Gtk3->main;
          279 
          280         die "Processing stopped at line $cur_lineno\n" if $stop;
          281 
          282         if (!$action) {
          283                 # This action isn't explicitly handled, but at least nothing
          284                 # breaks when the window is closed without selecting an action
          285                 $action = ["dummy"];
          286         }
          287         return $action;
          288 }
          289 
          290 # Prompt the user when a word has multiple replacement options (separated by $config->{choicesep})
          291 # $cur_lineno - display string to show the current line number
          292 # Returns:
          293 # 3, if this window needs to be called again but nothing needs
          294 #    to be re-transliterated
          295 # 1, if the line needs to be re-transliterated
          296 # 0, if nothing needs to be done
          297 sub prompt_choose_word {
          298         my ($substrings, $config, $args, $cur_lineno) = @_;
          299 
          300         # make a list of all substrings that contain multiple word options
          301         my @replacements;
          302         foreach (0..$#$substrings) {
          303                 if ($substrings->[$_]->[0] && $substrings->[$_]->[1] =~ /\Q$config->{choicesep}\E/) {
          304                         # This ugly bit of code is here as a special case for transliterating
          305                         # Hindi to Urdu text - if there are *exactly* two choices and one
          306                         # contains diacritics but the other one doesn't, the one with diacritics
          307                         # is automatically used instead of prompting the user.
          308                         if (exists $config->{"targetdiacritics"}) {
          309                                 my @choices = split /\Q$config->{choicesep}\E/, $substrings->[$_]->[1];
          310                                 my @diacritics = @{$config->{"targetdiacritics"}};
          311                                 if (@choices == 2) {
          312                                         @choices = map {NFD($_)} @choices;
          313                                         my $first_matches = grep {$choices[0] =~ /$_/} @diacritics;
          314                                         my $second_matches = grep {$choices[1] =~ /$_/} @diacritics;
          315                                         if ($first_matches && !$second_matches) {
          316                                                 $substrings->[$_]->[1] = $choices[0];
          317                                                 next;
          318                                         } elsif (!$first_matches && $second_matches) {
          319                                                 $substrings->[$_]->[1] = $choices[1];
          320                                                 next;
          321                                         }
          322                                 }
          323                         }
          324                         # Format of the elements in @replacements:
          325                         # [<id of substrings in $substrings>, <replacement word>, <original string>]
          326                         push @replacements, [$_, $substrings->[$_]->[1], $substrings->[$_]->[1]];
          327                 }
          328         }
          329         # no substrings have multiple options
          330         return if (!@replacements);
          331 
          332         my $stop = 0;
          333         my $open_unknown = 0;
          334         my $cur_replacement = 0;
          335 
          336         my $window = Gtk3::Window->new('toplevel');
          337         $window->signal_connect(delete_event => sub {return FALSE});
          338         $window->signal_connect(destroy => sub { Gtk3->main_quit; });
          339         $window->set_border_width(10);
          340 
          341         my $vbox = Gtk3::VBox->new(FALSE, 0);
          342 
          343         my $linelabel = Gtk3::Label->new("Current line: $cur_lineno");
          344         $vbox->pack_start($linelabel, FALSE, FALSE, 0);
          345 
          346         my $wordlabel = Gtk3::Label->new("");
          347         $wordlabel->set_alignment(0.0, 0.0);
          348         $vbox->pack_start($wordlabel, FALSE, FALSE, 0);
          349 
          350         my $undo = Gtk3::Button->new("_Undo");
          351         $vbox->pack_start($undo, FALSE, FALSE, 0);
          352         $undo->set_sensitive(FALSE);
          353 
          354         my $button_vbox = Gtk3::VBox->new(FALSE, 0);
          355         $vbox->pack_start($button_vbox, FALSE, FALSE, 0);
          356 
          357         my $accept = Gtk3::Button->new("_Accept changes?");
          358         $vbox->pack_start($accept, FALSE, FALSE, 0);
          359 
          360         my $hbox = Gtk3::HBox->new(FALSE, 0);
          361         my $label = Gtk3::Label->new("Context: ");
          362         my $text = Gtk3::TextView->new;
          363         $text->set_wrap_mode("word");
          364         my $buffer = $text->get_buffer();
          365         my $highlight = $buffer->create_tag("yellow_bg", "background", "yellow");
          366         $text->set_editable(FALSE);
          367         $hbox->pack_start($label, FALSE, FALSE, 0);
          368         $hbox->pack_start($text, TRUE, TRUE, 10);
          369         $vbox->pack_start($hbox, FALSE, FALSE, 10);
          370 
          371         $hbox = Gtk3::HBox->new(FALSE, 5);
          372         my $skip_button = Gtk3::Button->new("Sk_ip word");
          373         $hbox->pack_start($skip_button, FALSE, FALSE, 0);
          374         my $unknown_button = Gtk3::Button->new("_Open in unknown word window");
          375         $hbox->pack_start($unknown_button, FALSE, FALSE, 0);
          376         my $stop_button = Gtk3::Button->new("_Stop processing");
          377         $hbox->pack_start($stop_button, FALSE, FALSE, 0);
          378         $vbox->pack_start($hbox, FALSE, FALSE, 0);
          379 
          380         # generate the context to the left and to the right of the current word being replaced
          381         my $get_context = sub {
          382                 my ($contextl, $contextr) = ("", "");
          383                 my $tmp_replacement = 0;
          384                 foreach (0..$#$substrings) {
          385                         my $word = $substrings->[$_]->[1];
          386                         if ($tmp_replacement <= $#replacements && $replacements[$tmp_replacement]->[0] == $_) {
          387                                 $word = $replacements[$tmp_replacement]->[1];
          388                                 $tmp_replacement++;
          389                         }
          390                         # When nothing is left to replace, the entire string is in $contextl
          391                         if ($cur_replacement > $#replacements || $_ < $replacements[$cur_replacement]->[0]) {
          392                                 $contextl .= $word;
          393                         } elsif ($_ > $replacements[$cur_replacement]->[0]) {
          394                                 $contextr .= $word;
          395                         }
          396                 }
          397                 return ($contextl, $contextr);
          398         };
          399 
          400         # fill the text buffer with the context and current word, highlighting the word
          401         # if $cur_replacement is after the end of @replacements, don't highlight anything
          402         # (this happens when all words have been replaced and the user only needs to accept the changes)
          403         my $fill_text_buffer = sub {
          404                 my $start = $buffer->get_start_iter();
          405                 my $end = $buffer->get_end_iter();
          406                 $buffer->delete($start, $end);
          407                 my ($contextl, $contextr) = $get_context->();
          408                 $buffer->set_text($contextr);
          409                 if ($cur_replacement <= $#replacements) {
          410                         $start = $buffer->get_start_iter();
          411                         $buffer->insert_with_tags($start, $replacements[$cur_replacement]->[1], $highlight);
          412                 }
          413                 $start = $buffer->get_start_iter();
          414                 $buffer->insert($start, $contextl);
          415         };
          416 
          417         my $show_accept = sub {
          418                 $button_vbox->foreach(sub {my $child = shift; $child->destroy();});
          419                 $accept->show;
          420                 $accept->grab_focus;
          421                 $wordlabel->set_text("");
          422                 $skip_button->set_sensitive(FALSE);
          423                 $unknown_button->set_sensitive(FALSE);
          424         };
          425 
          426         my $fill_button_vbox; # forward-declaration so it can be used here already
          427         my $next_word = sub {
          428                 $undo->set_sensitive(TRUE);
          429                 $cur_replacement++;
          430                 $fill_text_buffer->();
          431                 if ($cur_replacement > $#replacements) {
          432                         $show_accept->();
          433                         return;
          434                 }
          435                 $fill_button_vbox->();
          436         };
          437 
          438         # fill $button_vbox with the word options for the current word
          439         $fill_button_vbox = sub {
          440                 $button_vbox->foreach(sub {my $child = shift; $child->destroy();});
          441                 my $word = $replacements[$cur_replacement]->[1];
          442                 $wordlabel->set_text("Word \"$word\" has multiple replacement options:");
          443                 my @choices = split /\Q$config->{choicesep}\E/, $replacements[$cur_replacement]->[1];
          444                 if (exists $config->{"targetdiacritics"}) {
          445                         # This nasty bit of code finds the number of diacritics in every
          446                         # choice and sorts the choice in descending order based on that
          447                         my %choice_nums;
          448                         foreach my $choice (@choices) {
          449                                 $choice_nums{$choice} = 0;
          450                                 foreach my $diacritic (@{$config->{"targetdiacritics"}}) {
          451                                         my @matches = NFD($choice) =~ /$diacritic/;
          452                                         $choice_nums{$choice} += scalar @matches if @matches;
          453                                 }
          454                         }
          455                         @choices = sort {$choice_nums{$b} <=> $choice_nums{$a}} @choices;
          456                 }
          457                 my $cur_num = 1;
          458                 foreach my $word_choice (@choices) {
          459                         # the mnemonics don't make sense when the number has more than one digit
          460                         my $choice_label = $cur_num <= 9 ? "_$cur_num: $word_choice" : $word_choice;
          461                         my $button = Gtk3::Button->new($choice_label);
          462                         $button->signal_connect(
          463                                 clicked => sub {
          464                                         $replacements[$cur_replacement]->[1] = $word_choice;
          465                                         $next_word->();
          466                                 }, $window);
          467                         $button_vbox->pack_start($button, FALSE, FALSE, 0);
          468                         $button->show;
          469                         $cur_num++;
          470                 }
          471         };
          472 
          473         $undo->signal_connect(
          474                 clicked => sub {
          475                         if ($cur_replacement > 0) {
          476                                 $cur_replacement--;
          477                                 if ($cur_replacement == 0) {
          478                                         $undo->set_sensitive(FALSE);
          479                                 }
          480                                 $replacements[$cur_replacement]->[1] = $replacements[$cur_replacement]->[2];
          481                                 $fill_button_vbox->();
          482                                 $fill_text_buffer->();
          483                                 $accept->hide;
          484                                 $skip_button->set_sensitive(TRUE);
          485                                 $unknown_button->set_sensitive(TRUE);
          486                                 my $word = $replacements[$cur_replacement]->[1];
          487                                 $wordlabel->set_text("Word \"$word\" has multiple replacement options:");
          488                         }
          489                 }, $window);
          490 
          491         $accept->signal_connect(
          492                 clicked => sub {
          493                         # write the changes to the original $substrings
          494                         foreach (@replacements) {
          495                                 $substrings->[$_->[0]]->[1] = $_->[1];
          496                         }
          497                         $window->destroy;
          498                 }, $window);
          499 
          500         $skip_button->signal_connect(clicked => $next_word, $window);
          501 
          502         $unknown_button->signal_connect(
          503                 clicked => sub {
          504                         $open_unknown = 1;
          505                         $window->destroy;
          506                 }, $window);
          507 
          508         $stop_button->signal_connect(
          509                 clicked => sub {
          510                         $stop = 1;
          511                         $window->destroy;
          512                 }, $window);
          513 
          514         $fill_button_vbox->();
          515         $fill_text_buffer->();
          516 
          517         $window->add($vbox);
          518         $window->show_all;
          519         $accept->hide;
          520         Gtk3->main;
          521         die "Processing stopped at line $cur_lineno\n" if $stop;
          522         if ($open_unknown) {
          523                 my $ret = call_unknown_word_window(
          524                         $substrings, $replacements[$cur_replacement]->[0],
          525                         $config, $args, $cur_lineno);
          526                 # the word choice window still needs to be called again
          527                 # when 0 is returned
          528                 return 3 if $ret == 0;
          529                 return $ret;
          530         }
          531         return 0;
          532 }
          533 
          534 my $ID = 0;
          535 my $STRING = 1;
          536 
          537 # Parse the configuration file into data type (currently only ID and STRING)
          538 sub parse_config {
          539         my $f = shift;
          540         my $fh;
          541         if (!open($fh, "<", $f)) {
          542                 warn "Can't open config file \"$f\"!\n";
          543                 return;
          544         }
          545         my @commands;
          546         my $state = 0;
          547         my $IN_ID = 1;
          548         my $IN_STR = 2;
          549         my $cur_val = "";
          550         while (my $line = <$fh>) {
          551                 chomp($line);
          552                 $state = 0;
          553                 push(@commands, []);
          554                 foreach my $char (split(//, $line)) {
          555                         if ($char eq "#" && !($state & $IN_STR)) {
          556                                 last;
          557                         } elsif ($char eq '"') {
          558                                 if ($state & $IN_STR) {
          559                                         push(@{$commands[-1]}, {type => $STRING, value => $cur_val});
          560                                         $cur_val = "";
          561                                         $state &= ~$IN_STR;
          562                                 } else {
          563                                         $cur_val = "";
          564                                         $state |= $IN_STR;
          565                                 }
          566                         } elsif ($char eq " ") {
          567                                 if ($state & $IN_ID) {
          568                                         push(@{$commands[-1]}, {type => $ID, value => $cur_val});
          569                                         $state &= ~$IN_ID;
          570                                         $cur_val = "";
          571                                 } elsif ($state) {
          572                                         $cur_val .= $char;
          573                                 }
          574                         } else {
          575                                 if (!$state) {
          576                                         $state |= $IN_ID;
          577                                 }
          578                                 $cur_val .= $char;
          579                         }
          580                 }
          581                 if ($state & $IN_STR) {
          582                         warn "ERROR: Unterminated string in config:\n$line";
          583                         return;
          584                 } elsif ($cur_val) {
          585                         push(@{$commands[-1]}, {type => $ID, value => $cur_val});
          586                         $cur_val = "";
          587                 }
          588                 if ($#{$commands[-1]} == -1) {
          589                         pop(@commands);
          590                 }
          591         }
          592         close($fh);
          593 
          594         return \@commands;
          595 }
          596 
          597 # if the path is relative, find its absolute location based
          598 # on the location of the config file
          599 sub open_file_rel_abs {
          600         my ($filename, $config_file, $mode) = @_;
          601         $mode //= "<";
          602         if (!file_name_is_absolute $filename) {
          603                 my $config_dir = dirname $config_file;
          604                 $filename = rel2abs($filename, $config_dir);
          605         }
          606         my $fh;
          607         if (!open $fh, $mode, $filename) {
          608                 warn "Can't open file \"$filename\"!\n";
          609                 return;
          610         }
          611         return $fh;
          612 }
          613 
          614 # Load a file of replacement words into a hash table
          615 sub load_table {
          616         my ($filename, $args, $config, $revert) = @_;
          617         my $fh = open_file_rel_abs $filename, $args->{"config"};
          618         return if !$fh;
          619         my %table;
          620         while (my $line = <$fh>) {
          621                 chomp $line;
          622                 next if (!$line);
          623                 my @words = split(/\Q$config->{tablesep}\E/, $line);
          624                 if (@words != 2) {
          625                         warn "ERROR: Malformed line in file \"$filename\":\n$line\n";
          626                         close $fh;
          627                         return;
          628                 }
          629                 my $word;
          630                 my $replacement;
          631                 if ($revert) {
          632                         $word = NFD $words[1];
          633                         $replacement = $words[0];
          634                 } else {
          635                         $word = NFD $words[0];
          636                         $replacement = $words[1];
          637                 }
          638                 my @word_choices = split /\Q$config->{choicesep}\E/, $word;
          639                 foreach my $word_choice (@word_choices) {
          640                         if (exists $table{$word_choice}) {
          641                                 if ($args->{"checkduplicates"}) {
          642                                         warn "WARNING: Duplicate word in file \"$filename\": " .
          643                                                 "\"$word_choice\", with replacement \"$replacement\", " .
          644                                                 "already exists with replacement \"$table{$word_choice}\".\n";
          645                                 }
          646                                 $table{$word_choice} = get_unique_words(
          647                                         $table{$word_choice} .
          648                                         $config->{choicesep} .
          649                                         $replacement, $config);
          650                         } else {
          651                                 $table{$word_choice} = $replacement;
          652                         }
          653                 }
          654         }
          655         close $fh;
          656         return \%table;
          657 }
          658 
          659 # Load table for words to ignore - only the keys matter, since there is no replacement
          660 sub load_ignore_table {
          661         my ($filename, $args) = @_;
          662         my $fh = open_file_rel_abs $filename, $args->{"config"};
          663         return if !$fh;
          664         my %table;
          665         while (my $line = <$fh>) {
          666                 chomp $line;
          667                 $table{NFD($line)} = "" if $line;
          668         }
          669         close $fh;
          670         return \%table;
          671 }
          672 
          673 # Generate all forms of a word by combining it with endings
          674 # Returns:
          675 # 0 - an error occurred
          676 # 1 - everything's fine
          677 sub expand_table {
          678         my ($table, $forms, $noroot, $config) = @_;
          679         my %new_table;
          680         foreach my $word (keys %$table) {
          681                 foreach my $ending (keys %$forms) {
          682                         # Some words and/or endings have multiple options, separated by $config->{choicesep}
          683                         # These must be temporarily separated in order to properly generate the forms
          684                         my @word_options;
          685                         my @stem_options = split(/\Q$config->{choicesep}\E/, $table->{$word});
          686                         my @ending_options = split(/\Q$config->{choicesep}\E/, $forms->{$ending});
          687                         foreach my $stem_option (@stem_options) {
          688                                 foreach my $ending_option (@ending_options) {
          689                                         push(@word_options, $stem_option . $ending_option);
          690                                 }
          691                         }
          692                         $new_table{$word . $ending} = join($config->{choicesep}, @word_options);
          693                 }
          694                 $new_table{$word} = $table->{$word} if !$noroot;
          695         }
          696         return \%new_table;
          697 }
          698 
          699 # Check if the number and types of arguments given to a config command are right
          700 # Returns:
          701 # undef - the arguments don't match
          702 # 1 - the arguments match
          703 sub check_args {
          704         my ($args, $cmd) = @_;
          705         my $cmd_name = $cmd->[0]->{"value"};
          706         if ($#$cmd - 1 < $#$args) {
          707                 my $err = "ERROR: not enough arguments for command \"$cmd_name\":";
          708                 foreach my $arg (@{$cmd}[1..$#$cmd]) {
          709                         $err .= " " . $arg->{"value"}
          710                 }
          711                 warn "$err\n";
          712                 return;
          713         }
          714         my $arg_num = 0;
          715         while ($arg_num <= $#$args) {
          716                 if ($cmd->[$arg_num + 1]->{"type"} != $args->[$arg_num]) {
          717                         my $err = "ERROR: argument type mismatch for command \"$cmd_name\".\n";
          718                         $err .= "Expected:";
          719                         foreach my $arg_type (@$args) {
          720                                 $err .= " ID" if ($arg_type == $ID);
          721                                 $err .= " STRING" if ($arg_type == $STRING);
          722                         }
          723                         $err .= "\nReceived:";
          724                         foreach my $arg (@{$cmd}[1..$#$cmd]) {
          725                                 $err .= " ID" if ($arg->{"type"} == $ID);
          726                                 $err .= " STRING" if ($arg->{"type"} == $STRING);
          727                         }
          728                         warn "$err\n";
          729                         return;
          730                 }
          731                 $arg_num++;
          732         }
          733         return 1;
          734 }
          735 
          736 # Interpret the config file - load and expand tables, etc.
          737 # $config_list - the list returned by parse_config
          738 sub interpret_config {
          739         my ($config_list, $args) = @_;
          740         my %tables;
          741         my %config;
          742         # table_paths stores a list of all table and replacement ids that are
          743         # impacted by the path so the replacement can be added on the fly when
          744         # a new replacement is added from the GUI
          745         # the "replacement id" is just the number of the replacement group,
          746         # starting at 0 with the first group in the config
          747         $config{"table_paths"} = {};
          748         # reverted_tables stores a hash of the paths of all tables that are
          749         # reverted so that replacements added from the GUI are added in the
          750         # right order
          751         $config{"reverted_tables"} = {};
          752         # these are the paths of the tables that are displayed in the GUI
          753         $config{"display_tables"} = {};
          754         # a mapping between the table ids and tables for all tables used as
          755         # ending tables in expand statements - so expansions can be done
          756         # on the fly when adding a replacement word from the GUI
          757         $config{"ending_tables"} = {};
          758         # ignore is the path to the ignore file, ignore_words the actual table
          759         $config{"ignore"} = "";
          760         $config{"ignore_words"} = {};
          761         $config{"split"} = "\\s+";
          762         $config{"beforeword"} = "\\s";
          763         $config{"afterword"} = "\\s";
          764         $config{"tablesep"} = "\t";
          765         $config{"choicesep"} = "\$";
          766         # a list of "replacement configs", which specify the type and any
          767         # other arguments (this is given to replace_match, etc.
          768         $config{"replacements"} = [];
          769         # these are temporary mappings used while loading the config
          770         my %path_to_table;
          771         my %table_id_to_path;
          772         my %mandatory_args = (
          773                 "ignore" => [$STRING],
          774                 "table" => [$ID],
          775                 "expand" => [$ID, $ID],
          776                 "match" => [$STRING, $STRING],
          777                 "matchignore" => [$STRING],
          778                 "replace" => [$ID],
          779                 "split" => [$STRING],
          780                 "beforeword" => [$STRING],
          781                 "afterword" => [$STRING],
          782                 "tablesep" => [$STRING],
          783                 "choicesep" => [$STRING],
          784                 "comment" => [$STRING],
          785                 "group" => [],
          786                 "endgroup" => [],
          787                 "retrywithout" => [$STRING],
          788                 "targetdiacritics" => [$STRING]
          789         );
          790         my $in_group = 0;
          791         foreach my $cmd (@$config_list) {
          792                 # All load statements must be before expand statements
          793                 # All expand, beforeword, and afterword statements must be before replace statements
          794                 my $cmd_name = $cmd->[0]->{"value"};
          795                 if ($cmd->[0]->{"type"} == $ID) {
          796                         if (!exists($mandatory_args{$cmd->[0]->{"value"}})) {
          797                                 warn "ERROR: Unknown command \"" . $cmd->[0]->{"value"} . "\" in config\n";
          798                                 return;
          799                         }
          800                         return if !check_args($mandatory_args{$cmd_name}, $cmd);
          801                         if ($cmd_name eq "table") {
          802                                 my $table_path = $cmd->[2]->{"value"};
          803                                 my %table_args;
          804                                 foreach (3..$#$cmd) {
          805                                         $table_args{$cmd->[$_]->{"value"}} = 1;
          806                                 }
          807                                 my $table;
          808                                 # add to temporary path-to-table mapping so tables aren't
          809                                 # loaded unnecessarily
          810                                 if (exists $path_to_table{$table_path}) {
          811                                         $table = $path_to_table{$table_path};
          812                                 } else {
          813                                         $table = load_table $table_path, $args, \%config, $table_args{"revert"};
          814                                         return if !defined $table;
          815                                         $path_to_table{$table_path} = $table;
          816                                 }
          817                                 if ($table_args{"revert"}) {
          818                                         $config{"reverted_tables"}->{$table_path} = 1;
          819                                 }
          820                                 my $table_id = $cmd->[1]->{"value"};
          821                                 $tables{$table_id} = $table;
          822                                 $table_id_to_path{$table_id} = $table_path;
          823                                 # this is a hash to avoid duplicates if the same file
          824                                 # is loaded multiple times
          825                                 $config{"display_tables"}->{$table_path} = 1 if !exists $table_args{"nodisplay"};
          826                         } elsif ($cmd_name eq "expand") {
          827                                 my $orig_table_id = $cmd->[1]->{"value"};
          828                                 my $ending_table_id = $cmd->[2]->{"value"};
          829                                 my $noroot = 0;
          830                                 if ($#$cmd >= 3 && $cmd->[3]->{"value"} eq "noroot") {
          831                                         $noroot = 1;
          832                                 }
          833                                 if (!exists $tables{$orig_table_id}) {
          834                                         warn "expand: table \"$orig_table_id\" doesn't exist\n";
          835                                         return;
          836                                 } elsif (!exists $tables{$ending_table_id}) {
          837                                         warn "expand: table \"$ending_table_id\" doesn't exist\n";
          838                                         return;
          839                                 }
          840 
          841                                 $config{"ending_tables"}->{$ending_table_id} = $tables{$ending_table_id};
          842                                 $config{"expands"}->{$orig_table_id} = [] if !exists $config{"expands"}->{$orig_table_id};
          843                                 push @{$config{"expands"}->{$orig_table_id}}, [$ending_table_id, $noroot];
          844 
          845                                 my $new_table = expand_table($tables{$orig_table_id}, $tables{$ending_table_id}, $noroot, \%config);
          846                                 return if !$new_table;
          847                                 $tables{$orig_table_id} = $new_table;
          848                         } elsif ($cmd_name eq "group") {
          849                                 if ($in_group) {
          850                                         warn "ERROR: New group started without ending last one in config\n";
          851                                         return;
          852                                 }
          853                                 push @{$config{"replacements"}}, {
          854                                         "type" => "group", "tables" => [],
          855                                         "words" => {}, "options" => {}};
          856                                 # add all options such as "endword" to the options hash
          857                                 for (1..$#$cmd) {
          858                                         $config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1;
          859                                 }
          860                                 $in_group = 1;
          861                         } elsif ($cmd_name eq "endgroup") {
          862                                 if (!$in_group) {
          863                                         warn "ERROR: endgroup command called while not in group\n";
          864                                         return;
          865                                 }
          866                                 $in_group = 0;
          867                         } elsif ($cmd_name eq "match") {
          868                                 if ($in_group) {
          869                                         warn "ERROR: match command is invalid inside group\n";
          870                                         return;
          871                                 }
          872                                 push @{$config{"replacements"}}, {
          873                                         "type" => "match",
          874                                         "options" => {},
          875                                         "search" => NFD($cmd->[1]->{"value"}),
          876                                         "replace" => $cmd->[2]->{"value"}};
          877                                 for (3..$#$cmd) {
          878                                         # add optional arguments as keys in options hash
          879                                         $config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1;
          880                                 }
          881                         } elsif ($cmd_name eq "matchignore") {
          882                                 if ($in_group) {
          883                                         warn "ERROR: matchignore command is invalid inside group\n";
          884                                         return;
          885                                 }
          886                                 push @{$config{"replacements"}}, {
          887                                         "type" => "match",
          888                                         "options" => {},
          889                                         "search" => NFD($cmd->[1]->{"value"})};
          890                                 for (2..$#$cmd) {
          891                                         $config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1;
          892                                 }
          893                         } elsif ($cmd_name eq "replace") {
          894                                 if (!$in_group) {
          895                                         warn "ERROR: replace command called while not in group\n";
          896                                         return;
          897                                 }
          898                                 my $table = $cmd->[1]->{"value"};
          899                                 if (!exists($tables{$table})) {
          900                                         warn "ERROR: nonexistent table \"$table\" in replace statement.\n";
          901                                         return;
          902                                 }
          903 
          904                                 # make a list of all replacements that are affected by this
          905                                 # file so that they can be updated when a word is added
          906                                 # through the gui
          907                                 my $table_path = $table_id_to_path{$table};
          908                                 my $replacement_id = $#{$config{"replacements"}};
          909                                 $config{"table_paths"}->{$table_path} = [] if !exists $config{"table_paths"}->{$table_path};
          910                                 push @{$config{"table_paths"}->{$table_path}}, [$replacement_id, $table];
          911 
          912                                 # store list of tables for --debug
          913                                 push @{$config{"replacements"}->[-1]->{"tables"}}, $table;
          914 
          915                                 # Note: we don't need to check if $table{"choicesep"} was defined
          916                                 # here since we can't ever get this far without first having
          917                                 # loaded a table anyways
          918                                 my $trie_root = $config{"replacements"}->[-1]->{"words"};
          919                                 my $override = $#$cmd >= 2 && $cmd->[2]->{"value"} eq "override";
          920                                 add_to_trie($table, $trie_root, $tables{$table}, $args, \%config, $override);
          921                         } elsif ($cmd_name eq "retrywithout") {
          922                                 if (!exists $config{"retrywithout"}) {
          923                                         $config{"retrywithout"} = [];
          924                                 }
          925                                 # first value is the display name
          926                                 my @values = map {$_->{"value"}} @{$cmd}[1..$#$cmd];
          927                                 push @{$config{"retrywithout"}}, \@values;
          928                         } elsif ($cmd_name eq "targetdiacritics") {
          929                                 if (!exists $config{$cmd_name}) {
          930                                         $config{$cmd_name} = [];
          931                                 }
          932                                 foreach (1..$#$cmd) {
          933                                         push @{$config{$cmd_name}}, $cmd->[$_]->{"value"};
          934                                 }
          935                         } elsif ($cmd_name eq "split" || $cmd_name eq "beforeword" ||
          936                                 $cmd_name eq "afterword" || $cmd_name eq "tablesep" ||
          937                                 $cmd_name eq "choicesep" || $cmd_name eq "comment") {
          938                                 $config{$cmd_name} = $cmd->[1]->{"value"};
          939                         } elsif ($cmd_name eq "ignore") {
          940                                 $config{"ignore"} = $cmd->[1]->{"value"};
          941                                 my $table = load_ignore_table $cmd->[1]->{"value"}, $args;
          942                                 return if !defined $table;
          943                                 $config{"ignore_words"} = $table;
          944                         } else {
          945                                 warn "ERROR: unknown command \"" . $cmd_name . "\" in config.\n";
          946                                 return;
          947                         }
          948                 } else {
          949                         my $err =  "ERROR: line does not start with command:\n";
          950                         foreach my $cmd_part (@$cmd) {
          951                                 $err .= $cmd_part->{"value"};
          952                         }
          953                         warn "$err\n";
          954                         return;
          955                 }
          956         }
          957         if ($in_group) {
          958                 warn "ERROR: unclosed group in config\n";
          959                 return;
          960         }
          961         if (!$config{"ignore"}) {
          962                 warn "ERROR: no file of words to ignore specified.\n";
          963                 return;
          964         }
          965         if ($args->{"dumptables"}) {
          966                 foreach my $table_id (keys %tables) {
          967                         my $table_path = $table_id_to_path{$table_id};
          968                         if ($config{"display_tables"}->{$table_path}) {
          969                                 for my $word (keys %{$tables{$table_id}}) {
          970                                         print NFC($word) . "\n";
          971                                 }
          972                         }
          973                 }
          974         }
          975         return \%config;
          976 }
          977 
          978 # load the config file
          979 # Returns:
          980 # the config hash or undef if an error occurred
          981 sub load_config {
          982         my $args = shift;
          983         my $config_list = parse_config($args->{"config"});
          984         if (!$config_list) {
          985                 return;
          986         }
          987         return interpret_config $config_list, $args;
          988 }
          989 
          990 # Handle the action returned by `prompt_unknown_word`
          991 # $config - the current program config
          992 # $args - the command line arguments
          993 # Currently accepted values for $action:
          994 # ["ignore", "run", $word] - only ignore $word for the rest of this run
          995 # ["ignore", "permanent", $word] - write $word to the permanent ignore file
          996 # ["add", $word, $replace_word, $table_path] - add $word to the table 
          997 #         corresponding to $table_path with $replace_word as its replacement. Note that
          998 #         only tables directly corresponding to paths work here - tables that only
          999 #         were created through "expand" in the config aren't ever shown separately
         1000 #         in `prompt_unknown_word`
         1001 # ["reload"] - reload the configuration file
         1002 # Returns:
         1003 # 0 - nothing needs to be done
         1004 # 1 - the current line needs to be re-transliterated with the new config
         1005 # 2 - an error occurred while reloading the config
         1006 # 3 - stop asking for unknown words on this line
         1007 sub handle_unknown_word_action {
         1008         my ($action, $config, $args) = @_;
         1009         if ($action->[0] eq "ignore") {
         1010                 # yeah, this is a bit messy and inconsistent
         1011                 return 3 if $action->[1] eq "wholeline";
         1012                 $config->{"ignore_words"}->{$action->[2]} = "";
         1013                 if ($action->[1] eq "permanent") {
         1014                         my $fh = open_file_rel_abs $config->{"ignore"}, $args->{"config"}, ">>";
         1015                         return 1 if !$fh;
         1016                         print($fh $action->[2] . "\n");
         1017                         close($fh);
         1018                 } elsif ($action->[1] eq "run") {
         1019                         # Print to error file if ignore isn't permanent
         1020                         return 0 if ($args->{"errors"} eq "");
         1021                         my $fh;
         1022                         if (!open($fh, ">>", $args->{"errors"})) {
         1023                                 warn "ERROR: Can't open error file \"$args->{errors}\".\n";
         1024                                 return 0;
         1025                         }
         1026                         print($fh $action->[2] . "\n");
         1027                         close($fh);
         1028                 }
         1029                 return 0;
         1030         } elsif ($action->[0] eq "add") {
         1031                 my $table_path = $action->[3];
         1032                 my $word = $action->[1];
         1033                 my $replace_word = $action->[2];
         1034                 # make sure to write the words in the correct order if the
         1035                 # tables were reverted while loading
         1036                 my $reverted = exists $config->{"reverted_tables"}->{$table_path};
         1037                 my $word_abs = $reverted ? $action->[2] : $action->[1];
         1038                 my $replace_word_abs = $reverted ? $action->[1] : $action->[2];
         1039                 my $fh = open_file_rel_abs $table_path, $args->{"config"}, ">>";
         1040                 return 1 if !$fh;
         1041                 print($fh $word_abs . $config->{tablesep} . $replace_word_abs . "\n");
         1042                 close($fh);
         1043                 # loop over all table ids that are impacted by this file
         1044                 foreach my $replacement (@{$config->{"table_paths"}->{$table_path}}) {
         1045                         my $replacement_id = $replacement->[0];
         1046                         my $table_id = $replacement->[1];
         1047                         my $trie = $config->{"replacements"}->[$replacement_id]->{"words"};
         1048                         my $final_table = {$word => $replace_word};
         1049                         # handle expansions
         1050                         foreach my $expand (@{$config->{"expands"}->{$table_id}}) {
         1051                                 my $ending_table_id = $expand->[0];
         1052                                 my $noroot = $expand->[1];
         1053                                 my $endings_table = $config->{"ending_tables"}->{$ending_table_id};
         1054                                 $final_table = expand_table $final_table, $endings_table, $noroot, $config;
         1055                         }
         1056                         add_to_trie($table_id, $trie, $final_table, $args, $config);
         1057                 }
         1058                 return 1;
         1059         } elsif ($action->[0] eq "reload") {
         1060                 my $new_config = load_config $args;
         1061                 if ($new_config) {
         1062                         %$config = %$new_config;
         1063                         return 1;
         1064                 } else {
         1065                         return 2;
         1066                 }
         1067         }
         1068 }
         1069 
         1070 # Split $substrings based on the "split" regex in $config.
         1071 # $substrings can already be split at this point; only the
         1072 # ones that haven't been transliterated yet are modified
         1073 sub split_words {
         1074         my ($config, $substrings) = @_;
         1075         my $split_re = qr/($config->{"split"})/;
         1076         my @substrings_new;
         1077         foreach my $cur_substr (@$substrings) {
         1078                 if ($cur_substr->[0] == 1) {
         1079                         push(@substrings_new, $cur_substr);
         1080                         next;
         1081                 }
         1082 
         1083                 my @words = split(/$split_re/, $cur_substr->[1]);
         1084                 for my $i (0..$#words) {
         1085                         # Word is not delimiter
         1086                         # Split produces an empty field at the beginning if the string
         1087                         # starts with the delimiter
         1088                         if ($i % 2 == 0) {
         1089                                 push(@substrings_new, [0, $words[$i], $words[$i]]) if ($words[$i] ne '');
         1090                         } else {
         1091                                 # Delimiters can count as already replaced
         1092                                 push(@substrings_new, [1, $words[$i], $words[$i]]);
         1093                         }
         1094                 }
         1095         }
         1096         @$substrings = @substrings_new;
         1097 }
         1098 
         1099 # small helper function to add a untransliterated string to the last substring
         1100 # if that is not transliterated yet, or push it onto @$substrings otherwise
         1101 # -> used to keep all untransliterated text in one piece
         1102 # since this is also used for the "nofinal" attribute on "match", it takes
         1103 # an original and replaced string (since, when using "match" and "nofinal",
         1104 # the original string was replaced, but is still marked as unknown)
         1105 sub push_unknown {
         1106         my ($substrings, $orig, $replaced) = @_;
         1107         $replaced //= $orig;
         1108         if (@$substrings && !$substrings->[-1]->[0]) {
         1109                 $substrings->[-1]->[1] .= $replaced;
         1110                 $substrings->[-1]->[2] .= $orig;
         1111         } else {
         1112                 push(@$substrings, [0, $replaced, $orig]);
         1113         }
         1114 }
         1115 
         1116 # Replace a word in $substrings based on $replace_config using regex
         1117 # $replace_config->{"search"} is the word to replace
         1118 # $replace_config->{"replace"} is the replacement word
         1119 #        if $replace_config->{"replace"} is undefined, just splits
         1120 #        $substrings at the the match and marks that the match has
         1121 #        been transliterated - currently used for "matchignore"
         1122 # $replace_config->{"beginword"}, $replace_config->{"afterword"} -
         1123 #        specifies if the match is only valid when $config->{"beforeword"}
         1124 #        or $config->{"afterword"} occur before or after it, respectively
         1125 sub replace_match {
         1126         my ($config, $replace_config, $substrings, $debug_msg) = @_;
         1127         my $beginword = exists $replace_config->{"options"}->{"beginword"};
         1128         my $endword = exists $replace_config->{"options"}->{"endword"};
         1129         my $fullword = $beginword && $endword;
         1130         my $beforeword = $config->{"beforeword"};
         1131         my $afterword = $config->{"afterword"};
         1132         my $word = $replace_config->{"search"};
         1133         my $replace_word = $replace_config->{"replace"};
         1134         if ($fullword) {
         1135                 $word = qr/(\A|$beforeword)$word(\z|$afterword)/;
         1136         } elsif ($beginword) {
         1137                 $word = qr/(\A|$beforeword)$word/;
         1138         } elsif ($endword) {
         1139                 $word = qr/$word(\z|$afterword)/;
         1140         } else {
         1141                 $word = qr/$word/;
         1142         }
         1143 
         1144         my @substrings_new;
         1145         # only modify $substrings at all if the word was found
         1146         my $found_word = 0;
         1147         my $last_idx;
         1148         # @substrings_new is only used if needed to improve efficiency
         1149         foreach my $i (0..$#$substrings) {
         1150                 if ($substrings->[$i]->[0]) {
         1151                         # FIXME: is there a way to make it more efficient by keeping the old array?
         1152                         # This is a major bottleneck
         1153                         # Note: the above statement *may* be a bit exaggerated
         1154                         if ($found_word) {
         1155                                 push(@substrings_new, $substrings->[$i]);
         1156                         }
         1157                         next;
         1158                 }
         1159                 $last_idx = 0;
         1160                 my $i0 = 0;
         1161                 my $i1 = 0;
         1162                 while ($substrings->[$i]->[1] =~ m/$word/g) {
         1163                         if (!$found_word) {
         1164                                 print $debug_msg if $debug_msg;
         1165                                 $found_word = 1;
         1166                                 if ($i != 0) {
         1167                                         push(@substrings_new, @{$substrings}[0..$i-1]);
         1168                                 }
         1169                         }
         1170                         # This mess is needed to reliably match $beforeword and $afterword and put the captured
         1171                         # "splitting" characters back into the text. This would be much easier just using
         1172                         # a lookbehind and lookahead, but I couldn't find a way to also match beginning and
         1173                         # end of string that way.
         1174                         $i0 = $-[0];
         1175                         $i1 = $+[0];
         1176                         if ($fullword) {
         1177                                 $i0 += length($1);
         1178                                 $i1 -= length($2);
         1179                                 # pos need to be decreased so that matches still work right next to each other
         1180                                 pos($substrings->[$i]->[1]) -= length($2);
         1181                         } elsif ($beginword) {
         1182                                 $i0 += length($1);
         1183                         } elsif ($endword) {
         1184                                 $i1 -= length($1);
         1185                                 pos($substrings->[$i]->[1]) -= length($1);
         1186                         }
         1187                         if ($last_idx != $i0) {
         1188                                 my $unknown = substr($substrings->[$i]->[1], $last_idx, $i0-$last_idx);
         1189                                 push_unknown \@substrings_new, $unknown;
         1190                         }
         1191                         my $orig_str = substr($substrings->[$i]->[1], $i0, $i1-$i0);
         1192                         my $replace_str = $replace_word // $orig_str;
         1193                         if ($replace_config->{"options"}->{"nofinal"}) {
         1194                                 warn "Replaced (nofinal) \"$orig_str\" with \"$replace_str\"\n" if $debug_msg;
         1195                                 push_unknown \@substrings_new, $orig_str, $replace_str;
         1196                         } else {
         1197                                 warn "Replaced \"$orig_str\" with \"$replace_str\"\n" if $debug_msg;
         1198                                 push(@substrings_new, [1, $replace_str, $orig_str]);
         1199                         }
         1200                         $last_idx = $i1;
         1201                 }
         1202                 if ($last_idx < length($substrings->[$i]->[1]) && $found_word) {
         1203                         my $unknown = substr($substrings->[$i]->[1], $last_idx);
         1204                         push_unknown \@substrings_new, $unknown;
         1205                 }
         1206         }
         1207         if ($found_word) {
         1208                 @$substrings = @substrings_new;
         1209         }
         1210 }
         1211 
         1212 # Replace a group, i.e. replace all the words in a trie
         1213 # $replace_config->{"words"} - the root node of the trie
         1214 # $replace_config->{"beginword"}, $replace_config->{"endword"} -
         1215 #        same as in `replace_match`
         1216 sub replace_group {
         1217         my ($config, $replace_config, $substrings, $debug_msg) = @_;
         1218         my @substrings_new;
         1219         my $word_found = 0;
         1220         # Recurse backwords towards the root node of the trie to find the first
         1221         # node with a key "final" which satisfies the ending condition (if "endword" is set)
         1222         # Returns the id *after* the last match and the node that was found
         1223         # with the key "final" (or undef, if nothing matched)
         1224         my $find_final = sub {
         1225                 my ($i, $tmp_cur_node, $s) = @_;
         1226                 do {
         1227                         my $after_ch = substr($s->[1], $i, 1);
         1228                         if (exists $tmp_cur_node->{"final"} && (!exists($replace_config->{"options"}->{"endword"}) ||
         1229                                 $after_ch eq "" || $after_ch =~ $config->{"afterword"})) {
         1230                                 return ($i, $tmp_cur_node);
         1231                         }
         1232                         $i--;
         1233                 } while ($tmp_cur_node = $tmp_cur_node->{"parent"});
         1234                 # none of the points were appropriate for breaking the word, so
         1235                 # $tmp_cur_node now points to the nonexistent parent node of the
         1236                 # root node
         1237                 return ($i, undef);
         1238         };
         1239         foreach my $s (@$substrings) {
         1240                 if ($s->[0]) {
         1241                         push(@substrings_new, $s);
         1242                         next;
         1243                 }
         1244                 my $cur_node = $replace_config->{"words"};
         1245                 my $start_i = 0;
         1246                 my $i = 0;
         1247                 # This deliberately goes off the end of the string! $cur_node is always "one behind" $i
         1248                 # since the node is only advanced in the iteration *after* $i has increased, meaning that
         1249                 # $i has to already be after the end of the string for the first if statement to definitely
         1250                 # fail, causing the elsif statement to handle that case
         1251                 while ($i <= length($s->[1])) {
         1252                         # This works even when $i is one index after the end of the string - it just returns "" then
         1253                         my $ch = substr($s->[1], $i, 1);
         1254                         if (exists $cur_node->{$ch}) {
         1255                                 if ($cur_node == $replace_config->{"words"}) {
         1256                                         my $before_ch = $i > 0 ? substr($s->[1], $i - 1, 1) : "";
         1257                                         if (exists($replace_config->{"options"}->{"beginword"}) &&
         1258                                                 $before_ch ne "" && $before_ch !~ $config->{"beforeword"}) {
         1259                                                 push_unknown \@substrings_new, $ch;
         1260                                                 $i++;
         1261                                                 next;
         1262                                         }
         1263                                         $start_i = $i;
         1264                                 }
         1265                                 $cur_node = $cur_node->{$ch};
         1266                         } elsif (exists $cur_node->{"final"} || $cur_node != $replace_config->{"words"} || $i == length($s->[1])-1) {
         1267                                 my $tmp_cur_node = $cur_node;
         1268                                 ($i, $tmp_cur_node) = $find_final->($i, $tmp_cur_node, $s);
         1269                                 if (!defined($tmp_cur_node)) {
         1270                                         push_unknown \@substrings_new, substr($s->[1], $i + 1, 1);
         1271                                         $i += 2;
         1272                                 } else {
         1273                                         my $orig = substr($s->[1], $start_i, $i-$start_i);
         1274                                         my $final = $tmp_cur_node->{"final"};
         1275                                         if ($debug_msg) {
         1276                                                 if (!$word_found) {
         1277                                                         warn $debug_msg;
         1278                                                         $word_found = 1;
         1279                                                 }
         1280                                                 warn "Replaced \"$orig\" with \"$final\"\n";
         1281                                         }
         1282                                         push(@substrings_new, [1, $final, $orig]);
         1283                                 }
         1284                                 $cur_node = $replace_config->{"words"};
         1285                                 next;
         1286                         } else {
         1287                                 push_unknown \@substrings_new, $ch;
         1288                         }
         1289                         $i++;
         1290                 }
         1291         }
         1292         @$substrings = @substrings_new;
         1293 }
         1294 
         1295 # Perform all replacements on $word, first removing all
         1296 # characters specified in $chars
         1297 sub replace_strip_chars {
         1298         my ($config, $args, $chars, $word) = @_;
         1299         foreach my $char (@$chars) {
         1300                 $word =~ s/\Q$char\E//g;
         1301         }
         1302         return replace_line($config, $args, $word);
         1303 }
         1304 
         1305 # Perform all replacements on $line based on $config
         1306 # $substrings: array of arrays - each one has three elements:
         1307 # first 0 or 1, indicating if the substring following it has already
         1308 # been replaced or not (1 means it has been replaced), then the
         1309 # transliterated string, and lastly the original string.
         1310 # If the first element is 0, the second two elements are obviously same
         1311 sub replace_line {
         1312         my ($config, $args, $line) = @_;
         1313         my $substrings = [[0, $line, $line]];
         1314         foreach my $replacement (@{$config->{"replacements"}}) {
         1315                 if ($replacement->{"type"} eq "match") {
         1316                         my $debug_msg;
         1317                         if ($args->{"debug"}) {
         1318                                 my $options = join " ", keys(%{$replacement->{"options"}});
         1319                                 $debug_msg =  "Match ($options): \"$replacement->{search}\"";
         1320                                 if ($replacement->{"replace"}) {
         1321                                         $debug_msg .= " \"$replacement->{replace}\"\n";
         1322                                 } else {
         1323                                         $debug_msg .= " (ignore)\n";
         1324                                 }
         1325                         }
         1326                         replace_match($config, $replacement, $substrings, $debug_msg);
         1327                 } elsif ($replacement->{"type"} eq "group") {
         1328                         my $debug_msg;
         1329                         if ($args->{"debug"}) {
         1330                                 my $options = join " ", keys(%{$replacement->{"options"}});
         1331                                 my $tables = '"' . join('" "', @{$replacement->{"tables"}}) . '"';
         1332                                 $debug_msg = "Group ($options): $tables\n";
         1333                         }
         1334                         replace_group($config, $replacement, $substrings, $debug_msg);
         1335                 }
         1336         }
         1337         # splits all words at the end so that the splitting characters
         1338         # aren't taken as unknown words and the unknown words are (hopefully)
         1339         # in better chunks for prompting the user about them
         1340         split_words($config, $substrings);
         1341 
         1342         return $substrings;
         1343 }
         1344 
         1345 # Call the unknown word window with the given substrings and index
         1346 # See `get_unknown_words` for explanation of other parameters
         1347 # (should be obvious)
         1348 # Returns:
         1349 # 3, if the rest of the line should be skipped
         1350 # 1, if the line needs to be re-transliterated
         1351 # 0, if nothing needs to be done
         1352 sub call_unknown_word_window {
         1353         my ($substrings, $i, $config, $args, $cur_lineno) = @_;
         1354         my $word = $substrings->[$i];
         1355         my $contextl = "";
         1356         my $contextl_orig = "";
         1357         foreach my $j (0..$i-1) {
         1358                 $contextl .= $substrings->[$j]->[1];
         1359                 $contextl_orig .= $substrings->[$j]->[2];
         1360         }
         1361         my $contextr = "";
         1362         my $contextr_orig = "";
         1363         foreach my $j ($i+1..$#$substrings) {
         1364                 $contextr .= $substrings->[$j]->[1];
         1365                 $contextr_orig .= $substrings->[$j]->[2];
         1366         }
         1367         my $action = prompt_unknown_word($contextl, $contextl_orig,
         1368                 $word->[1], $word->[2], $contextr, $contextr_orig,
         1369                 $config, "$cur_lineno", $args);
         1370         # if $ret == 3, rest of line should be skipped
         1371         # if $ret == 2, config could not be loaded
         1372         # if $ret == 1, line must be redone with new config
         1373         my $ret = handle_unknown_word_action($action, $config, $args);
         1374         # keep retrying until the user chooses an action which
         1375         # didn't throw an error
         1376         while ($ret == 2) {
         1377                 $action = prompt_unknown_word($contextl, $contextl_orig,
         1378                         $word->[1], $word->[2], $contextr, $contextr_orig,
         1379                         $config, "$cur_lineno", $args, 1);
         1380                 $ret = handle_unknown_word_action($action, $config, $args);
         1381         }
         1382         return $ret;
         1383 }
         1384 
         1385 # NOTE: MUST ALWAYS ADD REPLACEMENT WORDS FIRST!
         1386 # If an ignore word is added which is attached to a word that should have a replacement
         1387 # added and just that word is selected to ignore, you never get a chance to add a
         1388 # replacement for the other word that it is attached to
         1389 
         1390 # NOTE: This is very ugly code. The GUI code is the worst, but this whole part
         1391 # of the program is nasty. This is partially due to the fact that features kept
         1392 # being added when their use was discovered. This problem might be fixed in the
         1393 # future when I have time to rewrite all of this.
         1394 
         1395 # Handle unknown words
         1396 # $substrings - the current substrings with unknown words
         1397 # $config - the program config
         1398 # $args - the command line args
         1399 # $cur_lineno - display string to show the user the current line number
         1400 # Returns:
         1401 # 1 - the line needs to be re-transliterated
         1402 # 0 - all done
         1403 sub get_unknown_words {
         1404         my ($substrings, $config, $args, $cur_lineno) = @_;
         1405         foreach my $i (0 .. $#$substrings) {
         1406                 my $word = $substrings->[$i];
         1407                 if (!$word->[0] && !exists($config->{"ignore_words"}->{$word->[1]})) {
         1408                         my $ret = call_unknown_word_window $substrings, $i, $config, $args, $cur_lineno;
         1409                         # 3 means we ignore the line
         1410                         if ($ret == 3) {
         1411                                 foreach my $s (@$substrings) {
         1412                                         # revert all changes done on the line
         1413                                         $s->[1] = $s->[2];
         1414                                 }
         1415                                 return 0;
         1416                         }
         1417                         # 1 means the line needs to be re-transliterated
         1418                         return 1 if $ret == 1;
         1419                 }
         1420                 $i++;
         1421         }
         1422         return 0;
         1423 }
         1424 
         1425 # Main replacement function
         1426 # Opens the input file ($args->{"input"}) and writes the transliterated text
         1427 # to the file handle $outputfh, prompting the user for unknown words or
         1428 # word choices (if those aren't disabled on the command line)
         1429 sub replace {
         1430         my ($config, $args, $total_lines, $inputfh, $outputfh) = @_;
         1431         while (my $line = <$inputfh>) {
         1432                 next if $. < $args->{"start"};
         1433                 my $comment;
         1434                 if (exists $config->{"comment"}) {
         1435                         $line =~ s/\Q$config->{comment}\E(.*)\z//s;
         1436                         $comment = $1;
         1437                 }
         1438                 my $nfd_line = NFD($line);
         1439                 my $substrings = replace_line($config, $args, $nfd_line);
         1440 
         1441                 if (!$args->{"nounknowns"}) {
         1442                         # re-transliterate the string if the config was reloaded
         1443                         while (get_unknown_words($substrings, $config, $args, "$./$total_lines")) {
         1444                                 $substrings = replace_line($config, $args, $nfd_line);
         1445                         }
         1446                 } elsif ($args->{"debugspecial"}) {
         1447                         foreach my $s (@$substrings) {
         1448                                 if (!$s->[0] && !exists($config->{"ignore_words"}->{$s->[1]})) {
         1449                                         warn "Unknown word: \"$s->[1]\"\n";
         1450                                 }
         1451                         }
         1452                 }
         1453                 if (!$args->{"nochoices"}) {
         1454                         # this only loops more than once if the user presses the button
         1455                         # "Open in unknown word window"
         1456                         while (my $ret = prompt_choose_word($substrings, $config, $args, "$./$total_lines")) {
         1457                                 if ($ret == 1) {
         1458                                         $substrings = replace_line($config, $args, $nfd_line);
         1459                                 }
         1460                         }
         1461                 } elsif ($args->{"debugspecial"}) {
         1462                         foreach my $s (@$substrings) {
         1463                                 if ($s->[0] && $s->[1] =~ /\Q$config->{choicesep}\E/) {
         1464                                         my $num_choices = split /\Q$config->{choicesep}\E/, $s->[1];
         1465                                         warn "Word \"$s->[1]\" with $num_choices word choices.\n";
         1466                                 }
         1467                         }
         1468                 }
         1469 
         1470                 foreach (@$substrings) {
         1471                         print $outputfh $_->[1];
         1472                 }
         1473                 print $outputfh $comment if $comment;
         1474         }
         1475 }
         1476 
         1477 my %args = ("config" => "config", "start" => 1, "errors" => "", "output" => "");
         1478 GetOptions(
         1479         \%args, "debug", "debugspecial",
         1480         "nochoices", "nounknowns",
         1481         "force", "start=i",
         1482         "output=s", "config=s",
         1483         "errors=s", "help",
         1484         "checkduplicates", "dumptables") or pod2usage(1);
         1485 
         1486 pod2usage(-exitval => 0, -verbose => 2) if $args{"help"};
         1487 pod2usage(-exitval => 1, -verbose => 1) if @ARGV > 1;
         1488 
         1489 if (!-f $args{"config"}) {
         1490         die "ERROR: config file \"$args{config}\" does not exist or is not a file.\n";
         1491 }
         1492 my $config = load_config \%args;
         1493 if (!$config) {
         1494         die "ERROR: Invalid config\n";
         1495 }
         1496 exit 0 if ($args{"checkduplicates"} || $args{"dumptables"});
         1497 
         1498 my $inputfh;
         1499 my $total_lines = "UNKNOWN";
         1500 if (@ARGV < 1) {
         1501         warn "WARNING: no input file supplied; taking input from STDIN\n";
         1502         $inputfh = \*STDIN;
         1503 } else {
         1504         open $inputfh, "<", $ARGV[0] or die "ERROR: Cannot open input file \"$ARGV[0]\" for reading.\n";
         1505         # Is there *really* no more efficient way to get the total number of lines?
         1506         $total_lines = 0;
         1507         while (<$inputfh>) {$total_lines++};
         1508         close $inputfh;
         1509         open $inputfh, "<", $ARGV[0] or die "ERROR: Cannot open input file \"$ARGV[0]\" for reading.\n";
         1510 }
         1511 
         1512 if (-f $args{"errors"} && !$args{"force"}) {
         1513         my $choice = "";
         1514         while ($choice !~ /^[yn]$/) {
         1515                 print STDERR "\"$args{errors}\" already exists. Do you want to overwrite it? ";
         1516                 $choice = <STDIN>;
         1517                 chomp $choice;
         1518         }
         1519         die "ERROR: \"$args{errors}\" already exists.\n" if $choice ne "y";
         1520 }
         1521 
         1522 my $outputfh;
         1523 if ($args{"output"} eq "") {
         1524         warn "WARNING: no output file supplied; printing to STDOUT\n";
         1525         $outputfh = \*STDOUT;
         1526 } elsif (-f $args{"output"} && !$args{"force"}) {
         1527         my $choice = "";
         1528         while ($choice !~ /^[aoe]$/) {
         1529                 print STDERR "\"$args{output}\" already exists. (a)ppend, (o)verwrite, or (e)xit? ";
         1530                 $choice = <STDIN>;
         1531                 chomp $choice;
         1532         }
         1533         if ($choice eq "a") {
         1534                 open $outputfh, ">>", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n";
         1535         } elsif ($choice eq "e") {
         1536                 die "ERROR: \"$args{output}\" already exists.\n";
         1537         } else {
         1538                 open $outputfh, ">", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n";
         1539         }
         1540 } else {
         1541         open $outputfh, ">", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n";
         1542 }
         1543 
         1544 replace($config, \%args, $total_lines, $inputfh, $outputfh);
         1545 close $inputfh;
         1546 close $outputfh;
         1547 
         1548 __END__
         1549 
         1550 =head1 NAME
         1551 
         1552 transliterate.pl - Transliterate text files
         1553 
         1554 =head1 SYNOPSIS
         1555 
         1556 transliterate.pl [options][input file]
         1557 
         1558 Start the transliteration engine with the given file as input.
         1559 The input file defaults to STDIN if no filename is given.
         1560 
         1561 =head1 OPTIONS
         1562 
         1563 =over 8
         1564 
         1565 =item B<--output> <filename>
         1566 
         1567 Sets the output file to print to.
         1568 
         1569 If the file exists already and B<--force> is not set, the user is asked
         1570 if the file should be overwritten or appended to.
         1571 
         1572 B<Default:> C<STDOUT> (print to terminal)
         1573 
         1574 =item B<--config> <filename>
         1575 
         1576 Sets the configuration file to use.
         1577 
         1578 B<Default:> C<config>
         1579 
         1580 =item B<--checkduplicates>
         1581 
         1582 Prints all duplicate words within single table files and across tables
         1583 that are replaced within the same group, then exits the program.
         1584 
         1585 Note that this simply prints B<all> duplicates, even ones that are
         1586 legitimate. When duplicates are found during normal operation of
         1587 the program, they are simply combined in exactly the same way as the
         1588 regular word choices.
         1589 
         1590 Also note that the words are still added as possible choices, which
         1591 may be slightly confusing. If, for instance, a word "word" is stored
         1592 in the tables "tablea", "tableb", and "tablec" with the replacements
         1593 "a", "b", and "c", the first duplicate message will say that the
         1594 first occurrence was in table "tablea" with the replacement "a", and
         1595 the second duplicate message will say that the first occurrence was
         1596 in table "tablea" with the replacement "a$b" (assuming $ is the
         1597 value set as B<choicesep> in the config). This is just something to
         1598 be aware of.
         1599 
         1600 On that note, before duplicates are checked between tables in the
         1601 same replacement group, duplicates inside the same file are already
         1602 replaced, so that might be a bit confusing as well.
         1603 
         1604 =item B<--dumptables>
         1605 
         1606 Prints the words of all tables that don't have B<nodisplay> set.
         1607 
         1608 This is mainly meant to be used for generating word lists in order to
         1609 use them in a spell checker. Note that the words printed here are in
         1610 UTF-8 NFC (Unicode Canonical Composition Form), so it may not be ideal
         1611 when the spellchecked text is not in the same form.
         1612 
         1613 =item B<--nochoices>
         1614 
         1615 Disables prompting for the right word when multiple replacement words exist.
         1616 
         1617 This can be used to "weed out" all the unknown words before
         1618 commencing the laborious task of choosing the right word every time
         1619 multiple options exist.
         1620 
         1621 =item B<--nounknowns>
         1622 
         1623 Disables prompting for the right word when a word is not found in the database.
         1624 
         1625 This can be used together with B<--nochoices> to perform a quick test of how
         1626 well the actual engine is working without having to click through all the
         1627 prompts.
         1628 
         1629 =item B<--debug>
         1630 
         1631 Prints information helpful for debugging problems with the B<match> and B<group>
         1632 statements.
         1633 
         1634 For each B<match> or B<group> statement which replaces anything, the original
         1635 statement is printed (the format is a bit different than in the config) and
         1636 each actual word that's replaced is printed.
         1637 
         1638 =item B<--debugspecial>
         1639 
         1640 This option is only useful for automatic testing of the transliteration engine.
         1641 
         1642 If B<--nochoices> is enabled, each word in the input with multiple choices will
         1643 be output, along with the number of choices (can be used to test the proper
         1644 functioning of B<choicesep> in the config file).
         1645 
         1646 If B<--nounknowns> is enabled, each unknown word in the input is printed
         1647 (can be used to test that the B<ignore> options are working correctly).
         1648 
         1649 =item B<--force>
         1650 
         1651 Always overwrites the output and error file without asking.
         1652 
         1653 =item B<--start> <line number>
         1654 
         1655 Starts at the given line number instead of the beginning of the file.
         1656 
         1657 Note: when "Stop processing" is pressed, the current line number is
         1658 printed out. This is the current line that was being processed, so it
         1659 has not been printed to the output file yet and thus the program must
         1660 be resumed at that line, not the one afterwards.
         1661 
         1662 =item B<--errors> <filename>
         1663 
         1664 Specifies a file to write errors in. Note that this does not refer to
         1665 actual errors, but to any words that were temporarily ignored
         1666 (i.e. words for which "Ignore: This run" was clicked).
         1667 
         1668 If no file is specified, nothing is written. If a file is specified
         1669 that already exists and B<--force> is not set, the user is prompted
         1670 for action.
         1671 
         1672 =item B<--help>
         1673 
         1674 Displays the full documentation.
         1675 
         1676 =back
         1677 
         1678 =head1 DESCRIPTION
         1679 
         1680 B<transliterate.pl> will read the given input file and transliterate it
         1681 based on the given configuration file, prompting the user for action if
         1682 a word has multiple replacement options or is not found in the database.
         1683 
         1684 See L</"CONFIGURATION"> for details on what is possible.
         1685 
         1686 Note that this is B<not> some sort of advanced transliteration engine
         1687 which understands the grammar of the language and tries to guess words
         1688 based on that. This is only a glorified find-and-replace program
         1689 with some extra features to make it useful for transliterating text
         1690 using large wordlists.
         1691 
         1692 WARNING: All input data is assumed to be UTF-8!
         1693 
         1694 =head1 WORD CHOICE WINDOW
         1695 
         1696 The word choice window is opened any time one word has multiple replacement
         1697 options and prompts the user to choose one.
         1698 
         1699 For each word with multiple options, the user must choose the right option
         1700 and then press "Accept changes" to finalize the transliteration of the
         1701 current line. The button to accept changes is selected by default, so it
         1702 is possible to just press enter instead of manually clicking it. Before the
         1703 line is finalized, the user may press "Undo" to undo any changes on the
         1704 current line.
         1705 
         1706 "Skip word" just leaves it as is. This shouldn't be needed in most cases
         1707 since B<choicesep> should always be set to a character that doesn't occur
         1708 normally in the text anyways.
         1709 
         1710 "Open in unknown word window" will open the
         1711 L<unknown word window|/"UNKNOWN WORD WINDOW"> with the current word
         1712 selected. This is meant as a helper if you notice that another word choice
         1713 needs to be added.
         1714 
         1715 Warning: This is very inconsistent and buggy! Since the unknown word window
         1716 is just opened directly, it isn't modified to make more sense for this
         1717 situation. Whenever "Add replacement" is pressed, the whole line is
         1718 re-transliterated as usual, but the word choice window is opened again
         1719 right afterwards. If you just want to go back to the word choice window,
         1720 press the ignore button for "whole line" since that shouldn't break
         1721 anything. There are weird inconsistencies, though - for instance, if you
         1722 delete all words in the tables, then press "Reload config", the line will
         1723 be re-transliterated and none of the words will actually be found, but it
         1724 will still go on because control passes back to the word choice window no
         1725 matter what. Also, none of the word choices that were already done on this
         1726 line are saved since the line is restarted from the beginning. As I said,
         1727 it's only there as a helper and is very buggy/inconsistent. Maybe I'll make
         1728 everything work better in a future release.
         1729 
         1730 "Stop processing" will exit the program and print the line number that was
         1731 currently being processed.
         1732 
         1733 =head1 UNKNOWN WORD WINDOW
         1734 
         1735 The unknown word window is opened any time a word could not be replaced.
         1736 
         1737 Both the context from the original script and the context from the
         1738 transliterated version (so far) is shown. If a part of the text is
         1739 selected in one of the text boxes and "Use selection as word" is
         1740 pressed for the appropriate box, the selected text is used for the
         1741 action that is taken subsequently. "Reset text" resets the text in
         1742 the text box to its original state (except for the highlight because
         1743 I'm too lazy to do that).
         1744 
         1745 The possible actions are:
         1746 
         1747 =over 8
         1748 
         1749 =item Ignore
         1750 
         1751 "This run" only ignores the word until the program exits, while
         1752 "Permanently" saves the word in the ignore file specified in the
         1753 configuration. "Whole line" stops asking for unknown words on
         1754 this line and prints the line out as it originally was in the
         1755 file. Note that any words in the original line that contain
         1756 B<choicesep> will still cause the L<word choice window|/"WORD CHOICE WINDOW">
         1757 to appear due to the way it is implemented. Just press "Skip word"
         1758 if that happens.
         1759 
         1760 =item Retry without <display name>
         1761 
         1762 Removes all characters specified in the corresponding B<retrywithout>
         1763 statement in the L<config|/"CONFIGURATION">
         1764 from the currently selected word and re-transliterates just that
         1765 word. The result is then pasted into the text box beside
         1766 "Add replacement" so it can be added to a table. This is only a
         1767 sort of helper for languages like Urdu in which words often can
         1768 be written with or without diacritics. If the "base form" without
         1769 diacritics is already in the tables, this button can be used to
         1770 quickly find the transliteration instead of having to type it
         1771 out again. Any part of the word that couldn't be transliterated
         1772 is just pasted verbatim into the text box (but after the
         1773 characters have been removed).
         1774 
         1775 Note that the selection can still be modified after this, before
         1776 pressing "Add to list". This could potentially be useful if a word
         1777 is in a table that is expanded using "noroot" because for instance
         1778 "Retry without diacritics" would only work with the full word (with
         1779 the ending), but only the stem should be added to the list. If that
         1780 is the case, "Retry without diacritics" could be pressed with the
         1781 whole word selected, but the ending could be removed before actually
         1782 pressing "Add to list".
         1783 
         1784 A separate button is shown for every B<retrywithout> statement
         1785 in the config.
         1786 
         1787 =item Add to list
         1788 
         1789 Adds the word typed in the text box beside "Add replacement" to the
         1790 selected table file as the replacement for the word currently selected
         1791 and re-runs the replacement on the current line. All table files that
         1792 do not have B<nodisplay> set are shown as options, see L</"CONFIGURATION">.
         1793 
         1794 Warning: This simply appends the word and its replacement to the end
         1795 of the file, so it will cause an error if there was no newline
         1796 ("\n") at the end of the file before.
         1797 
         1798 Note that this always re-transliterates the entire line afterwards.
         1799 This is to allow more flexibility. Consider, for instance, a compound
         1800 word of which the first part is also a valid single word. If the
         1801 entire line was not re-transliterated, it would be impossible to
         1802 add a replacement for that entire compound word and have it take
         1803 effect during the same run since the first part of the word would
         1804 not even be available for transliteration anymore.
         1805 
         1806 One problem is that the word is just written directly to the file
         1807 and there is no undo. This is the way it currently is and will
         1808 probably not change very soon. If a mistake is made, the word can
         1809 always be removed again manually from the list and "Reload config"
         1810 pressed.
         1811 
         1812 =item Reload config
         1813 
         1814 Reload the configuration file along with all tables an re-runs the
         1815 replacement on the current line. Note that this can take a short
         1816 while since the entire word database has to be reloaded.
         1817 
         1818 =item Stop processing
         1819 
         1820 Prints the current line number to the terminal and exits the program.
         1821 
         1822 The program can always be started again at this line number using
         1823 the B<--start> option if needed.
         1824 
         1825 =back
         1826 
         1827 =head1 INTERNALS/EXAMPLES
         1828 
         1829 This section was added to explain to the user how the transliteration
         1830 process works internally since that may be necessary to understand
         1831 why certain words are replaced the way they are.
         1832 
         1833 First off, the process works line-by-line, i.e. no B<match> or B<replace>
         1834 statement will ever match anything that crosses the end of a line.
         1835 
         1836 Each line is initially stored as one chunk which is marked as
         1837 untransliterated. Then, all B<match>, B<matchignore>, and B<replace>
         1838 (or, rather, B<group>) statements are executed in the order they
         1839 appear in the config file. Whenever a word/match is replaced, it
         1840 is split off into a separate chunk which is marked as transliterated.
         1841 A chunk marked as transliterated I<is entirely ignored by any
         1842 replacement statements that come afterwards>. Note that B<beginword>
         1843 and B<endword> can always match at the boundary between an
         1844 untransliterated and transliterated chunk. This is to facilitate
         1845 automated replacement of certain grammatical constructions. For instance:
         1846 
         1847 If the string "a-" could be attached as a prefix to any word and needed
         1848 to be replaced as "b-" everywhere, it would be quite trivial to add
         1849 a match statement C<'match "a-" "b-" beginword'>. If run on the text
         1850 "a-word", where "word" is some word that should be transliterated
         1851 as "word_replaced", and the group replace statement for the word comes
         1852 after the match statement given above, the following would happen:
         1853 First, the match statement would replace "a-" and split the text into
         1854 the two chunks "b-" and "word", where "b-" is already marked as
         1855 transliterated. Since "word" is now separate, it will be matched
         1856 by the group replace statement later, even if it has B<beginword> set
         1857 and would normally not match if "a-" came before it. Thus, the final
         1858 output will be "b-word_replaced", allowing for the uniform replacement
         1859 of the prefix instead of having to add each word twice, once with and
         1860 once without the prefix.
         1861 
         1862 In certain cases, this behavior may not be desired. Consider, for
         1863 instance, a prefix "c-" which cannot be replaced uniformly as in the
         1864 example above due to differences in the source and destination script.
         1865 Since it cannot be replaced uniformly, two words "word1" and "word2"
         1866 would both need to be specified separately with replacements for
         1867 "c-word1" and "c-word2". If, however, the prefix "c-" has an
         1868 alternate spelling "c " (without the hyphen), it would be very useful
         1869 to be able to automatically recognize that as well. This is where the
         1870 B<nofinal> attribute for the B<match> statements comes in. If there is
         1871 a match statement C<'match "c " "c-" beginword nofinal'>, the replaced
         1872 chunk is B<not> marked as transliterated, so after executing this
         1873 statement on the text "c word1", there will still only be one chunk,
         1874 "c-word1", allowing for the regular word replacements to function
         1875 properly.
         1876 
         1877 Once all the replacement statements have been processed, each chunk
         1878 of text that is not marked as transliterated yet is split based on
         1879 the B<split> pattern specified in the config and all actual characters
         1880 matched by the B<split> pattern are marked as transliterated (this
         1881 usually means all the spaces, newlines, quotation marks, etc.). Any
         1882 remaining words/text chunks that are still marked as untransliterated are
         1883 now processed by the unknown word window. If one of these remaining
         1884 unknown chunks is present in the file specified by the B<ignore>
         1885 statement in the config, it is simply ignored and later printed out
         1886 as is. After all untransliterated words have either had a replacement
         1887 added or been ignored, any words with multiple replacement choices are
         1888 processed by the word choice window. Once this is all done, the final
         1889 output is written to the output file and the process is repeated with
         1890 the next line. Note that the entire process is started again each time
         1891 a word is added to a table or the config is reloaded from the
         1892 L<unknown word window|/"UNKNOWN WORD WINDOW">.
         1893 
         1894 =head1 CONFIGURATION
         1895 
         1896 These are the commands accepted in the configuration file.
         1897 Any parameters in square brackets are optional.
         1898 Comments are started with C<#>. Strings (filenames, regex strings, etc.)
         1899 are enclosed in double quotes ("").
         1900 
         1901 The B<match>, B<matchignore>, and B<replace> commands are executed in
         1902 the order they are specified, except that all B<replace> commands within
         1903 the same group are replaced together.
         1904 
         1905 The B<match> and B<matchignore> statements accept any RegEx strings and
         1906 are thus very powerful. The B<group> statements only work with the
         1907 non-RegEx words from the tables, but are very efficient for large numbers
         1908 of words and should thus be used for the main bulk of the words.
         1909 
         1910 Any duplicate words found will cause the user to be prompted to choose
         1911 one option every time the word is replaced in the input text.
         1912 
         1913 Note that any regex strings specified in the config should B<not>
         1914 contain capture groups, as that would break the B<endword> functionality
         1915 since this is also implemented internally using capture groups. Capture
         1916 groups are also entirely pointless in the config since they currently
         1917 cannot be used as part of the replacement string in B<match> statements.
         1918 Lookaheads and lookbehinds are fine, though, and could be useful in
         1919 certain cases.
         1920 
         1921 All tables must be loaded before they are used, or there will be an
         1922 error that the table does not exist.
         1923 
         1924 Warning: If a B<replace> statement is located before an B<expand>
         1925 statement that would have impacted the table used, there will be no
         1926 error but the expand statement won't have any impact.
         1927 
         1928 Basic rule of thumb: Always put the B<table> statements before the
         1929 B<expand> statements and the B<expand> statements before the B<replace>
         1930 statements.
         1931 
         1932 =over 8
         1933 
         1934 =item B<split> <regex string>
         1935 
         1936 Sets the RegEx string to be used for splitting words. This is only used
         1937 for splitting the words which couldn't be replaced after all replacement
         1938 has been done, before prompting the user for unknown words.
         1939 
         1940 Note that B<split> should probably always contain at least C<\n>, since
         1941 otherwise all of the newlines will be marked as unknown words. Usually,
         1942 this will be included anyways through C<\s>.
         1943 
         1944 Note also that B<split> should probably include the C<+> RegEx-quantifier
         1945 since that allows the splitting function in the end to ignore several
         1946 splitting characters right after each other (e.g. several spaces) in one
         1947 go instead of splitting the string again for every single one of them.
         1948 This shouldn't actually make any difference functionality-wise, though.
         1949 
         1950 B<Default:> C<\s+> (all whitespace)
         1951 
         1952 =item B<beforeword> <regex string>
         1953 
         1954 Sets the RegEx string to be matched before a word if B<beginword> is set.
         1955 
         1956 B<Default:> C<\s>
         1957 
         1958 =item B<afterword> <regex string>
         1959 
         1960 Sets the RegEx string to be matched after a word if B<endword> is set.
         1961 
         1962 Note that B<afterword> should probably always contain at least C<\n>,
         1963 since otherwise words with B<endword> set will not be matched at the
         1964 end of a line.
         1965 
         1966 B<beforeword> and B<afterword> will often be exactly the same, but
         1967 they are left as separate options in case more fine-tuning is needed.
         1968 
         1969 B<Default:> C<\s>
         1970 
         1971 =item B<tablesep> <string>
         1972 
         1973 Sets the separator used to split the lines in the table files into the
         1974 original and replacement word.
         1975 
         1976 B<Default:> C<Tab>
         1977 
         1978 =item B<choicesep> <string>
         1979 
         1980 Sets the separator used to split replacement words into multiple choices for
         1981 prompting the user.
         1982 
         1983 B<Default:> C<$>
         1984 
         1985 =item B<comment> <string>
         1986 
         1987 If enabled, anything after C<< <string> >> will be ignored on all lines in
         1988 the input file. This will not be displayed in the
         1989 L<unknown word window|/"UNKNOWN WORD WINDOW"> or L<word choice window|/"WORD CHOICE WINDOW">
         1990 but will still be printed in the end, with the comment character removed
         1991 (that seems to be the most sensible thing to do).
         1992 
         1993 Note that this is really just a "dumb replacement", so there's no way to
         1994 prevent a line with the comment character from being ignored. Just try
         1995 to always set this to a character that does not occur anywhere in the text
         1996 (or don't use the option at all).
         1997 
         1998 =item B<ignore> <filename>
         1999 
         2000 Sets the file of words to ignore.
         2001 
         2002 This has to be set even if the file is just empty because the user can
         2003 add words to it from the unknown word window.
         2004 
         2005 =item B<table> <table identifier> <filename> [nodisplay] [revert]
         2006 
         2007 Load the table from C<< <filename> >>, making it available for later use in the
         2008 B<expand> and B<replace> commands using the identifier C<< <table identifier> >>.
         2009 
         2010 if B<nodisplay> is set, the filename for this table is not shown in the
         2011 L<unknown word window|/"UNKNOWN WORD WINDOW">. If, however, the same filename
         2012 is loaded again for another table that does not have B<nodisplay> set, it is
         2013 still displayed.
         2014 
         2015 If B<revert> is set, the original and replacement words are switched. This can
         2016 be useful for creating a config for transliterating in the opposite direction
         2017 with the same database. I don't know why I called it "revert" since it should
         2018 actually be called "reverse". I guess I was a bit confused.
         2019 
         2020 Note that if C<< <filename> >> is not an absolute path, it is taken to be relative
         2021 to the location of the configuration file.
         2022 
         2023 The table files simply consist of B<tablesep>-separated values, with the word in the
         2024 original script first and the replacement word second. Both the original and
         2025 replacement word can optionally have several parts separated by B<choicesep>. If the
         2026 original word has multiple parts, it is separated and each of the parts is added
         2027 to the table with the replacement. If the replacement has multiple parts, the user
         2028 will be prompted to choose one of the options during the transliteration process.
         2029 If the same word occurs multiple times in the same table with different replacements,
         2030 the replacements are automatically added as choices that will be handled by the
         2031 L<word choice window|/"WORD CHOICE WINDOW">.
         2032 
         2033 If, for whatever reason, the same table is needed twice, but with different endings,
         2034 the table can simply be loaded twice with different IDs. If the same path is loaded,
         2035 the table that has already been loaded will be reused. Note that this feature was
         2036 added before adding B<revert>, so the old table is used even if it had B<revert>
         2037 set and the new one doesn't. This is technically a problem, but I don't know of
         2038 any real-world case where it would be a problem, so I'm too lazy to change it.
         2039 Tell me if it actually becomes a problem for you.
         2040 
         2041 WARNING: Don't load the same table file both with and without B<revert> in the same
         2042 config! When a replacement word is added through the GUI, the program has to know
         2043 which way to write the words. Currently, whenever a table file is loaded with
         2044 B<revert> anywhere in the config (even if it is loaded without B<revert> in a
         2045 different place), words will automatically be written as if B<revert> was on. I
         2046 cannot currently think of any reason why someone would want to load a file both
         2047 with and without B<revert> in the same config, but I still wanted to add this
         2048 warning just in case.
         2049 
         2050 =item B<expand> <table identifier> <word ending table> [noroot]
         2051 
         2052 Expand the table C<< <table identifier> >>, i.e. generate all the word forms using
         2053 the word endings in C<< <word ending table> >>, saving the result as a table with the
         2054 identifier C<< <new table identifier> >>.
         2055 
         2056 Note: There used to be a C<< <new table identifier> >> argument to create a new
         2057 table in case one table had to be expanded with different endings. This has been
         2058 removed because it was a bit ugly, especially since there wasn't a proper mapping
         2059 from table IDs to filenames anymore. If this functionality is needed, the same table
         2060 file can simply be loaded multiple times. See the B<table> section above.
         2061 
         2062 If B<noroot> is set, the root forms of the words are not kept.
         2063 
         2064 If the replacement for a word ending contains B<choicesep>, it is split and each part
         2065 is combined with the root form separately and the user is prompted to choose one of
         2066 the options later. it is thus possible to allow multiple choices for the ending if
         2067 there is a distinction in the replacement script but not in the source script.
         2068 Note that each of the root words is also split into its choices (if necessary)
         2069 during the expanding, so it is possible to use B<choicesep> in both the endings
         2070 and root words.
         2071 
         2072 =item B<match> <regex string> <replacement string> [beginword] [endword] [nofinal]
         2073 
         2074 Perform a RegEx match using the given C<< <regex string> >>, replacing it with
         2075 C<< <replacement string> >>. Note that the replacement cannot contain any RegEx
         2076 (e.g. groups) in it. B<beginword> and B<endword> specify whether the match must
         2077 be at the beginning or ending of a word, respectively, using the RegEx specified
         2078 in B<beforeword> and B<afterword>. If B<nofinal> is set, the string is not marked
         2079 as transliterated after the replacement, allowing it to be modified by subsequent
         2080 B<match> or B<replace> commands.
         2081 
         2082 =item B<matchignore> <regex string> [beginword] [endword]
         2083 
         2084 Performs a RegEx match in the same manner as B<match>, except that the original
         2085 match is used as the replacement instead of specifying a replacement string, i.e.
         2086 whatever is matched is just marked as transliterated without changing it.
         2087 
         2088 =item B<group> [beginword] [endword]
         2089 
         2090 Begins a replacement group. All B<replace> commands must occur between B<group>
         2091 and B<endgroup>, since they are then grouped together and replaced in one go.
         2092 B<beginword> and B<endword> act in the same way as specified for B<match> and
         2093 apply to all B<replace> statements in this group.
         2094 
         2095 =item B<replace> <table identifier> [override]
         2096 
         2097 Replace all words in the table with the identifier C<< <table identifier> >>,
         2098 using the B<beginword> and B<endword> settings specified by the current group.
         2099 
         2100 Unless B<override> is set on the latter table, if the same word occurs in two
         2101 tables with different replacements, both are automatically added as choices.
         2102 See L</"WORD CHOICE WINDOW">.
         2103 
         2104 B<override> can be useful if the same database is used for both directions and
         2105 one direction maps multiple words to one word, but in the other direction this
         2106 word should always default to one of the choices. In that case, a small table
         2107 with these special cases can be created and put at the end of the main B<group>
         2108 statement with B<override> set. This is technically redundant since you could
         2109 just add a special group with only the override table in it earlier in the
         2110 config, but it somehow seems cleaner this way.
         2111 
         2112 Note that a table must have been loaded before being used in a B<replace> statement.
         2113 
         2114 =item B<endgroup>
         2115 
         2116 End a replacement group.
         2117 
         2118 =item B<retrywithout> <display name> [character] [...]
         2119 
         2120 Adds a button to the L<unknown word window|/"UNKNOWN WORD WINDOW"> to retry the
         2121 replacements on the selected word, first removing the given characters.
         2122 The button is named "<display name>" and located after the "Retry without" label.
         2123 Whatever is found with the replacements is pasted into the regular text box for
         2124 the "Add replacement" functionality.
         2125 
         2126 This can be used as an aid when, for instance, words can be written with or without
         2127 certain diacritics. If the actual word without diacritics is already in the
         2128 database and there is a B<retrywithout> statement for all the diacritics, the
         2129 button can be used to quickly find the replacement for the word instead of having
         2130 to type it out manually. The same goes for compound words that can be written
         2131 with or without a space.
         2132 
         2133 It is also possible to specify B<retrywithout> without any characters, which just
         2134 adds a button that takes whatever word is selected and retries the replacements
         2135 on it. This can be useful if you want to manually edit words and quickly see if
         2136 they are found with the edits in place.
         2137 
         2138 Note that all input text is first normalized to the unicode canonical decomposition
         2139 form so that diacritics can be removed individually.
         2140 
         2141 Also note that all buttons are currently just dumped in the GUI without any
         2142 sort of wrapping, so they'll run off the screen if there are too many.
         2143 Tell me if this becomes a problem. I'm just too lazy to change it right now.
         2144 
         2145 Small warning: This only removes the given characters from the word selected in
         2146 the GUI, not from the tables. Thus, this only works if the version of the word
         2147 without any of the characters is already present in the tables. It would be useful
         2148 when handling diacritics if the program could simply make a comparison while
         2149 completely ignoring diacritics, but I haven't figured out a nice way to implement
         2150 that yet.
         2151 
         2152 Historical note: This was called B<diacritics> in a previous version and only
         2153 allowed removal of diacritics. This is exactly the same functionality, just
         2154 generalized to allow removal of any characters with different buttons.
         2155 
         2156 =item B<targetdiacritics> <diacritic> [...]
         2157 
         2158 This was only added to simplify transliteration from Hindi to Urdu with the
         2159 same database. When this is set, the choices in the
         2160 L<word choice window|/"WORD CHOICE WINDOW"> are sorted in descending order
         2161 based on the number of diacritics from this list that are matched in each
         2162 choice. This is so that when transliterating from Hindi to Urdu, the choice
         2163 with the most diacritics is always at the top.
         2164 
         2165 Additionally, if there are I<exactly> two choices for a word and one of
         2166 them contains diacritics but the other one doesn't, the one containing
         2167 diacritics is automatically taken without ever prompting the user. This
         2168 is, admittedly, a very language-specific feature, but I couldn't think of
         2169 a simple way of adding it without building it directly into the actual program.
         2170 
         2171 Note that due to the way this is implemented, it will not take any effect
         2172 if B<--nochoices> is enabled.
         2173 
         2174 The attentive reader will notice at this point that most of the features
         2175 in this program were added specifically for dealing with Urdu and Hindi,
         2176 which does appear to make sense, considering that this program was written
         2177 specifically for transliterating Urdu to Hindi and vice versa (although
         2178 not quite as much vice versa).
         2179 
         2180 =back
         2181 
         2182 =head1 BUGS
         2183 
         2184 Although it may not seem like it, one of the ugliest parts of the program is the
         2185 GUI functionality that allows the user to add a replacement word. The problem is
         2186 that all information about the B<expand> and B<replace> statements has to be kept
         2187 in order to properly handle adding a word to one of the files and simultaneously
         2188 adding it to the currently loaded tables I<without reloading the entire config>.
         2189 The way it currently works, the replacement word is directly written to the file,
         2190 then all B<expand> statements that would have impacted the words from this file
         2191 are redone (just for the newly added word) and the resulting words are added to
         2192 the appropriate tables (or, technically, the appropriate 'trie'). Since a file
         2193 can be mapped to multiple table IDs and a table ID can occur in multiple replace
         2194 statements, this is more complicated than it sounds, and thus it is very likely
         2195 that there are bugs lurking here somewhere. Do note that "Reload config" will
         2196 B<always> reload the entire configuration, so that's safe to do even if the
         2197 on-the-fly replacing doesn't work.
         2198 
         2199 In general, I have tested the GUI code much less than the rest since you can't
         2200 really test it automatically very well.
         2201 
         2202 The code is generally quite nasty, especially the parts belonging to the GUI.
         2203 Don't look at it.
         2204 
         2205 Tell me if you find any bugs.
         2206 
         2207 =head1 SEE ALSO
         2208 
         2209 perlre, perlretut
         2210 
         2211 =head1 LICENSE
         2212 
         2213 Copyright (c) 2019, 2020, 2021 lumidify <nobody[at]lumidify.org>
         2214 
         2215 Permission to use, copy, modify, and/or distribute this software for any
         2216 purpose with or without fee is hereby granted, provided that the above
         2217 copyright notice and this permission notice appear in all copies.
         2218 
         2219 THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
         2220 WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
         2221 MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
         2222 ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
         2223 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
         2224 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
         2225 OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
         2226 
         2227 =cut