URI: 
       tCopy data from old repository - transliterate - Transliteration engine
  HTML git clone git://lumidify.org/git/transliterate.git
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
   DIR commit 410d20484d0ceda89cfcad455b727bac02d071d9
  HTML Author: lumidify <nobody@lumidify.org>
       Date:   Thu, 26 Mar 2020 07:37:37 +0100
       
       Copy data from old repository
       
       Diffstat:
         A .gitignore                          |       1 +
         A README                              |       5 +++++
         A tests/alltests.sh                   |       4 ++++
         A tests/runtest.sh                    |      13 +++++++++++++
         A tests/test1/config                  |      15 +++++++++++++++
         A tests/test1/descr.txt               |       1 +
         A tests/test1/err.txt                 |       6 ++++++
         A tests/test1/expected.txt            |       6 ++++++
         A tests/test1/input.txt               |       6 ++++++
         A tests/test2/config                  |      19 +++++++++++++++++++
         A tests/test2/descr.txt               |       1 +
         A tests/test2/err.txt                 |       3 +++
         A tests/test2/expected.txt            |       2 ++
         A tests/test2/input.txt               |       2 ++
         A transliterate.pl                    |    1605 +++++++++++++++++++++++++++++++
       
       15 files changed, 1689 insertions(+), 0 deletions(-)
       ---
   DIR diff --git a/.gitignore b/.gitignore
       t@@ -0,0 +1 @@
       +data
   DIR diff --git a/README b/README
       t@@ -0,0 +1,5 @@
       +See the perldoc in transliterate.pl for documentation (run perldoc -F transliterate.pl).
       +
       +The git history on this repository is not complete because the original
       +repository contained a lot of private information that I didn't want to
       +make available here.
   DIR diff --git a/tests/alltests.sh b/tests/alltests.sh
       t@@ -0,0 +1,4 @@
       +#!/bin/sh
       +
       +./runtest.sh test1
       +./runtest.sh test2
   DIR diff --git a/tests/runtest.sh b/tests/runtest.sh
       t@@ -0,0 +1,13 @@
       +#!/bin/sh
       +
       +../transliterate.pl --force --config "$1/config" --output runtest.txt --nounknowns --nochoices --debug "$1/input.txt" > runtest_err.txt 2>&1
       +diff "$1/expected.txt" runtest.txt > /dev/null
       +err1=$?
       +diff "$1/err.txt" runtest_err.txt > /dev/null
       +if [ $? -eq 0 -a $err1 -eq 0 ]; then
       +  echo "OK `cat $1/descr.txt`"
       +else
       +  echo "FAILED `cat $1/descr.txt`"
       +fi
       +rm runtest.txt
       +rm runtest_err.txt
   DIR diff --git a/tests/test1/config b/tests/test1/config
       t@@ -0,0 +1,15 @@
       +split "[ \n]"
       +beforeword " "
       +afterword "[ \n]"
       +
       +ignore "../data/ignore.txt"
       +table words "../data/words.txt"
       +table endings "../data/endings.txt"
       +
       +expand words same endings
       +
       +match "\d+" "num_replaced" beginword
       +
       +group beginword endword
       +replace words
       +endgroup
   DIR diff --git a/tests/test1/descr.txt b/tests/test1/descr.txt
       t@@ -0,0 +1 @@
       +Basic test
   DIR diff --git a/tests/test1/err.txt b/tests/test1/err.txt
       t@@ -0,0 +1,6 @@
       +Unknown word: "word20"
       +Unknown word: "word01231"
       +Word "word0_replaced$word0_replaced2" with 2 word choices.
       +Unknown word: "aword1"
       +Unknown word: "end3"
       +Word "word0_replacedend3_replaced$word0_replaced2end3_replaced" with 2 word choices.
   DIR diff --git a/tests/test1/expected.txt b/tests/test1/expected.txt
       t@@ -0,0 +1,6 @@
       +word1_replaced word2_replaced
       +num_replacedword1_replaced       word9_replaced   num_replaced   word4_replaced
       +word20 word01231    word0_replaced$word0_replaced2
       +aword1
       +word1_replacedend1_replaced  word0_replacedend3_replaced$word0_replaced2end3_replaced   end3
       +num_replacedword2_replacedend1_replaced
   DIR diff --git a/tests/test1/input.txt b/tests/test1/input.txt
       t@@ -0,0 +1,6 @@
       +word1 word2
       +123word1       word9   123   word4
       +word20 word01231    word0
       +aword1
       +word1end1  word0end3   end3
       +432word2end1
   DIR diff --git a/tests/test2/config b/tests/test2/config
       t@@ -0,0 +1,19 @@
       +split "[ \n]+"
       +beforeword " "
       +afterword "[ \n]"
       +tablesep ","
       +choicesep "|"
       +
       +ignore "../data/ignore.txt"
       +table words "../data/words1.txt"
       +table endings "../data/endings_choices.txt"
       +
       +expand words other_name endings
       +
       +match "-\d" "-r" endword
       +match "w ord" "word" beginword nofinal
       +matchignore "\dhi\d"
       +
       +group beginword endword
       +replace other_name
       +endgroup
   DIR diff --git a/tests/test2/descr.txt b/tests/test2/descr.txt
       t@@ -0,0 +1 @@
       +Different tablesep and linesep; match nofinal; ending choices
   DIR diff --git a/tests/test2/err.txt b/tests/test2/err.txt
       t@@ -0,0 +1,3 @@
       +Word "word1_replacedend1r1|word1_replacedend1r2" with 2 word choices.
       +Unknown word: "4"
       +Word "word0_replacedend1r1|word0_replacedend1r2|word0_replaced2end1r1|word0_replaced2end1r2" with 4 word choices.
   DIR diff --git a/tests/test2/expected.txt b/tests/test2/expected.txt
       t@@ -0,0 +1,2 @@
       +5hi3 word9_replaced-r   word1_replacedend1r1|word1_replacedend1r2-r
       +45hi1 word0_replacedend1r1|word0_replacedend1r2|word0_replaced2end1r1|word0_replaced2end1r2
   DIR diff --git a/tests/test2/input.txt b/tests/test2/input.txt
       t@@ -0,0 +1,2 @@
       +5hi3 w ord9-0   w ord1end1-1
       +45hi1 word0end1
   DIR diff --git a/transliterate.pl b/transliterate.pl
       t@@ -0,0 +1,1605 @@
       +#!/usr/bin/env perl
       +
       +# Proudly written using vi (OpenBSD nvi)
       +
       +# NOTE: If you're wondering why the error codes used by the functions are so
       +# inconsistent, go ask my former self
       +
       +use strict;
       +use warnings;
       +use utf8;
       +use feature 'unicode_strings';
       +use open qw< :encoding(UTF-8) >;
       +binmode STDOUT, ":utf8";
       +binmode STDERR, ":utf8";
       +use Unicode::Normalize;
       +use Glib qw/TRUE FALSE/;
       +use Gtk2 '-init';
       +use Getopt::Long;
       +use Pod::Usage;
       +use Scalar::Util qw(weaken);
       +use File::Basename qw(dirname);
       +use File::Spec::Functions qw(rel2abs file_name_is_absolute);
       +
       +# takes a string of words separated by '$' and returns a new string in the
       +# same format with duplicates removed
       +sub get_unique_words {
       +        my ($word, $config) = @_;
       +        my %tmp;
       +        my @words_uniq = grep !$tmp{$_}++, split /\Q$config->{choicesep}\E/, $word;
       +        return join $config->{choicesep}, @words_uniq;
       +}
       +
       +# Adds all words in $words to $trie
       +# Automatically combines duplicate words with "$" inbetween
       +sub add_to_trie {
       +        my ($table_name, $trie, $words, $args, $config) = @_;
       +        foreach my $word (keys %$words) {
       +                my $cur_node = $trie;
       +                foreach my $char (split //, $word) {
       +                        if (!exists($cur_node->{$char})) {
       +                                $cur_node->{$char}->{"parent"} = $cur_node;
       +                                # This is required to avoid circular references
       +                                # (otherwise, the garbage collector doesn't ever
       +                                # destroy these nodes, leading to the memory
       +                                # consumption growing without restraint if
       +                                # "Reload config" is used)
       +                                weaken($cur_node->{$char}->{"parent"});
       +                        }
       +                        $cur_node = $cur_node->{$char};
       +                }
       +                if (exists($cur_node->{"final"})) {
       +                        if ($args->{"checkduplicates"}) {
       +                                warn "WARNING: Duplicate word \"$word\". Last occurrence as " .
       +                                        "\"$cur_node->{final}\" in table \"$cur_node->{table_name}\", " .
       +                                        "current occurrence as \"$words->{$word}\" in " .
       +                                        "table \"$table_name\.\n";
       +                        }
       +                        $cur_node->{"final"} = get_unique_words($cur_node->{"final"} . $config->{choicesep} . $words->{$word}, $config);
       +                } else {
       +                        $cur_node->{"final"} = $words->{$word};
       +                        if ($args->{"checkduplicates"}) {
       +                                $cur_node->{"table_name"} = $table_name;
       +                        }
       +                }
       +        }
       +}
       +
       +# Prompt user when no replacement has been found for a word
       +# $word is the word that was not found and $context* the context,
       +# with context*_orig being the original, non-transliterated context.
       +# $table_paths is a mapping of table paths (here, only the keys, i.e.
       +# the actual paths, are used) to allow the user to choose a table to
       +# save a new replacement to.
       +# $cur_lineno is a display string to show the current line number
       +# $config_error is an optional flag to specify whether an error
       +# message should be displayed, informing the user that the config
       +# could not be loaded (used when "Reload config" is clicked)
       +# Returns: an array reference containing an action to be taken,
       +# in the form ["<action name>", <optional args>].
       +# See `handle_unknown_word_action` for currently accepted values
       +sub prompt_unknown_word {
       +        my ($contextl, $contextl_orig, $word, $contextr, $contextr_orig, $table_paths, $cur_lineno, $config_error) = @_;
       +        my $action;
       +        my $stop = 0;
       +
       +        my $window = Gtk2::Window->new('toplevel');
       +        $window->signal_connect(delete_event => sub {return FALSE});
       +        $window->signal_connect(destroy => sub { Gtk2->main_quit; });
       +        $window->set_border_width(10);
       +
       +        my $vbox = Gtk2::VBox->new(FALSE, 0);
       +
       +        my $linelabel = Gtk2::Label->new("Current line: $cur_lineno");
       +        $vbox->pack_start($linelabel, FALSE, FALSE, 10);
       +        $linelabel->show;
       +
       +        my $wordlabel = Gtk2::Label->new("Word not found: $word");
       +        $wordlabel->set_alignment(0.0, 0.0);
       +        $vbox->pack_start($wordlabel, FALSE, FALSE, 10);
       +        $wordlabel->show;
       +
       +        # Make a text box with the given left and right context and label
       +        # Also creates a button allowing the user to set the currently
       +        # selected text as the word to be replaced - useful when only part
       +        # of the entire word that was not found has to be replaced
       +        my $make_context_box = sub {
       +                my ($ctxtl, $ctxtr, $lbl) = @_;
       +                my $hbox = Gtk2::HBox->new(FALSE, 0);
       +                my $label = Gtk2::Label->new($lbl);
       +                my $text = Gtk2::TextView->new;
       +                my $buffer = $text->get_buffer();
       +                $buffer->set_text($ctxtr);
       +                my $highlight = $buffer->create_tag("yellow_bg", "background", "yellow");
       +                my $start = $buffer->get_start_iter();
       +                $buffer->insert_with_tags($start, $word, $highlight);
       +                $start = $buffer->get_start_iter();
       +                $buffer->insert($start, $ctxtl);
       +                my $button = Gtk2::Button->new("Use selection as word");
       +                $button->signal_connect(
       +                        clicked => sub {
       +                                if (my ($start, $end) = $buffer->get_selection_bounds()) {
       +                                        $word = $buffer->get_text($start, $end, FALSE);
       +                                        $wordlabel->set_text("Selected: $word");
       +                                }
       +                        }, $window);
       +                $hbox->pack_start($label, FALSE, FALSE, 0);
       +                $hbox->pack_start($text, FALSE, FALSE, 10);
       +                $label->show;
       +                $text->show;
       +                $vbox->pack_start($hbox, FALSE, FALSE, 10);
       +                $hbox->show;
       +                $hbox = Gtk2::HBox->new(FALSE, 0);
       +                $hbox->pack_start($button, FALSE, FALSE, 0);
       +                $button->show;
       +                $vbox->pack_start($hbox, FALSE, FALSE, 0);
       +                $hbox->show;
       +        };
       +        $make_context_box->($contextl, $contextr, "Context: ");
       +        $make_context_box->($contextl_orig, $contextr_orig, "Original: ");
       +
       +        my $hbox = Gtk2::HBox->new(FALSE, 0);
       +        my $label = Gtk2::Label->new("Ignore: ");
       +        $hbox->pack_start($label, FALSE, FALSE, 0);
       +        $label->show;
       +        my $button = Gtk2::Button->new("This run");
       +        $button->signal_connect(
       +                clicked => sub {
       +                        $action = ["ignore", "run", $word];
       +                        $window->destroy;
       +                }, $window);
       +        $hbox->pack_start($button, FALSE, FALSE, 0);
       +        $button->show;
       +        $button = Gtk2::Button->new("Permanently");
       +        $button->signal_connect(
       +                clicked => sub {
       +                        $action = ["ignore", "permanent", $word];
       +                        $window->destroy;
       +                }, $window);
       +        $hbox->pack_start($button, FALSE, FALSE, 0);
       +        $button->show;
       +        $vbox->pack_start($hbox, FALSE, FALSE, 10);
       +        $hbox->show;
       +
       +        $hbox = Gtk2::HBox->new(FALSE, 0);
       +        $label = Gtk2::Label->new("Add to list: ");
       +        $hbox->pack_start($label, FALSE, FALSE, 0);
       +        $label->show;
       +        my $path_list = Gtk2::ComboBox->new_text;
       +        foreach my $path (keys %$table_paths) {
       +                $path_list->append_text($path);
       +        }
       +        $hbox->pack_start($path_list, FALSE, FALSE, 0);
       +        $path_list->show;
       +        $vbox->pack_start($hbox, FALSE, FALSE, 10);
       +        $hbox->show;
       +
       +        $hbox = Gtk2::HBox->new(FALSE, 0);
       +        $label = Gtk2::Label->new("Replacement: ");
       +        $hbox->pack_start($label, FALSE, FALSE, 0);
       +        $label->show;
       +        my $replace_entry = Gtk2::Entry->new;
       +        $hbox->pack_start($replace_entry, FALSE, FALSE, 0);
       +        $replace_entry->show;
       +        $vbox->pack_start($hbox, FALSE, FALSE, 0);
       +        $hbox->show;
       +
       +        $hbox = Gtk2::HBox->new(FALSE, 0);
       +        $button = Gtk2::Button->new("Add replacement");
       +        $button->signal_connect(
       +                clicked => sub {
       +                        if ($path_list->get_active != -1) {
       +                                $action = ["add", $word, $replace_entry->get_text, $path_list->get_active_text];
       +                                $window->destroy;
       +                        }
       +                }, $window);
       +        $hbox->pack_start($button, FALSE, FALSE, 0);
       +        $button->show;
       +        $vbox->pack_start($hbox, FALSE, FALSE, 0);
       +        $hbox->show;
       +
       +        $hbox = Gtk2::HBox->new(FALSE, 0);
       +        $button = Gtk2::Button->new("Stop processing");
       +        $button->signal_connect(
       +                clicked => sub {
       +                        $stop = 1;
       +                        $window->destroy;
       +                }, $window);
       +        $hbox->pack_start($button, FALSE, FALSE, 0);
       +        $button->show;
       +        $vbox->pack_start($hbox, FALSE, FALSE, 0);
       +        $hbox->show;
       +
       +        $hbox = Gtk2::HBox->new(FALSE, 0);
       +        $button = Gtk2::Button->new("Reload config");
       +        $button->signal_connect(
       +                clicked => sub {
       +                        $action = ["reload"];
       +                        $window->destroy;
       +                }, $window);
       +        $hbox->pack_start($button, FALSE, FALSE, 0);
       +        $button->show;
       +
       +        if ($config_error) {
       +                $label = Gtk2::Label->new("Error loading config; see terminal output for details");
       +                $hbox->pack_start($label, FALSE, FALSE, 0);
       +                $label->show;
       +        }
       +        $vbox->pack_start($hbox, FALSE, FALSE, 0);
       +        $hbox->show;
       +
       +        $window->add($vbox);
       +        $vbox->show;
       +        $window->show;
       +        Gtk2->main;
       +
       +        if ($stop) {
       +                die "Processing stopped at line $cur_lineno";
       +        }
       +        if (!$action) {
       +                # This action isn't actually handled - it doesn't make much sense,
       +                # but at least nothing breaks when the window is closed without
       +                # selecting an action
       +                $action = ["ignore", "once", $word];
       +        }
       +        return $action;
       +}
       +
       +# Prompt the user when a word has multiple replacement options (separated by $)
       +# $cur_lineno - display string to show the current line number
       +sub prompt_choose_word {
       +        my ($substrings, $cur_lineno, $config) = @_;
       +
       +        # make a list of all substrings that contain multiple word options
       +        my @replacements;
       +        foreach (0..$#$substrings) {
       +                if ($substrings->[$_]->[1] =~ /\Q$config->{choicesep}\E/) {
       +                        # Format of the elements in @replacements:
       +                        # [<id of substrings in $substrings>, <replacement word>, <original string>]
       +                        push @replacements, [$_, $substrings->[$_]->[1], $substrings->[$_]->[1]];
       +                }
       +        }
       +        # no substrings have multiple options
       +        return if (!@replacements);
       +
       +        my $stop = 0;
       +        my $cur_replacement = 0;
       +
       +        my $window = Gtk2::Window->new('toplevel');
       +        $window->signal_connect(delete_event => sub {return FALSE});
       +        $window->signal_connect(destroy => sub { Gtk2->main_quit; });
       +        $window->set_border_width(10);
       +
       +        my $vbox = Gtk2::VBox->new(FALSE, 0);
       +
       +        my $linelabel = Gtk2::Label->new("Current line: $cur_lineno");
       +        $vbox->pack_start($linelabel, FALSE, FALSE, 0);
       +        $linelabel->show;
       +
       +        my $wordlabel = Gtk2::Label->new("");
       +        $wordlabel->set_alignment(0.0, 0.0);
       +        $vbox->pack_start($wordlabel, FALSE, FALSE, 0);
       +        $wordlabel->show;
       +
       +        my $undo = Gtk2::Button->new("Undo");
       +        $vbox->pack_start($undo, FALSE, FALSE, 0);
       +        $undo->show;
       +        $undo->set_sensitive(FALSE);
       +
       +        my $button_vbox = Gtk2::VBox->new(FALSE, 0);
       +        $vbox->pack_start($button_vbox, FALSE, FALSE, 0);
       +        $button_vbox->show;
       +
       +        my $accept = Gtk2::Button->new("Accept changes?");
       +        $vbox->pack_start($accept, FALSE, FALSE, 0);
       +
       +        my $hbox = Gtk2::HBox->new(FALSE, 0);
       +        my $label = Gtk2::Label->new("Context: ");
       +        my $text = Gtk2::TextView->new;
       +        my $buffer = $text->get_buffer();
       +        my $highlight = $buffer->create_tag("yellow_bg", "background", "yellow");
       +        $text->set_editable(FALSE);
       +        $hbox->pack_start($label, FALSE, FALSE, 0);
       +        $hbox->pack_start($text, FALSE, FALSE, 10);
       +        $label->show;
       +        $text->show;
       +        $vbox->pack_start($hbox, FALSE, FALSE, 10);
       +        $hbox->show;
       +
       +        $hbox = Gtk2::HBox->new(FALSE, 0);
       +        my $stop_button = Gtk2::Button->new("Stop processing");
       +        $hbox->pack_start($stop_button, FALSE, FALSE, 0);
       +        $stop_button->show;
       +        $vbox->pack_start($hbox, FALSE, FALSE, 0);
       +        $hbox->show;
       +
       +        $window->add($vbox);
       +        $vbox->show;
       +        $window->show;
       +
       +        # generate the context to the left and to the right of the current word being replaced
       +        my $get_context = sub {
       +                my ($contextl, $contextr) = ("", "");
       +                my $tmp_replacement = 0;
       +                foreach (0..$#$substrings) {
       +                        my $word = $substrings->[$_]->[1];
       +                        if ($tmp_replacement <= $#replacements && $replacements[$tmp_replacement]->[0] == $_) {
       +                                $word = $replacements[$tmp_replacement]->[1];
       +                                $tmp_replacement++;
       +                        }
       +                        # When nothing is left to replace, the entire string is in $contextl
       +                        if ($cur_replacement > $#replacements || $_ < $replacements[$cur_replacement]->[0]) {
       +                                $contextl .= $word;
       +                        } elsif ($_ > $replacements[$cur_replacement]->[0]) {
       +                                $contextr .= $word;
       +                        }
       +                }
       +                return ($contextl, $contextr);
       +        };
       +
       +        # fill the text buffer with the context and current word, highlighting the word
       +        # if $cur_replacement is after the end of @replacements, don't highlight anythiing
       +        # (this happens when all words have been replaced and the user only needs to accept the changes)
       +        my $fill_text_buffer = sub {
       +                my $start = $buffer->get_start_iter();
       +                my $end = $buffer->get_end_iter();
       +                $buffer->delete($start, $end);
       +                my ($contextl, $contextr) = $get_context->();
       +                $buffer->set_text($contextr);
       +                if ($cur_replacement <= $#replacements) {
       +                        $start = $buffer->get_start_iter();
       +                        $buffer->insert_with_tags($start, $replacements[$cur_replacement]->[1], $highlight);
       +                }
       +                $start = $buffer->get_start_iter();
       +                $buffer->insert($start, $contextl);
       +        };
       +
       +        # fill $button_vbox with the word options for the current word
       +        my $fill_button_vbox;
       +        $fill_button_vbox = sub {
       +                $button_vbox->foreach(sub {my $child = shift; $child->destroy();});
       +                my $word = $replacements[$cur_replacement]->[1];
       +                $wordlabel->set_text("Word \"$word\" has multiple replacement options:");
       +                my @choices = split /\Q$config->{choicesep}\E/, $replacements[$cur_replacement]->[1];
       +                foreach my $word_choice (@choices) {
       +                        my $button = Gtk2::Button->new($word_choice);
       +                        $button->signal_connect(
       +                                clicked => sub {
       +                                        $replacements[$cur_replacement]->[1] = $word_choice;
       +                                        $undo->set_sensitive(TRUE);
       +                                        $cur_replacement++;
       +                                        $fill_text_buffer->();
       +                                        if ($cur_replacement > $#replacements) {
       +                                                $button_vbox->foreach(sub {my $child = shift; $child->destroy();});
       +                                                $accept->show;
       +                                                $accept->grab_focus;
       +                                                $wordlabel->set_text("");
       +                                                return;
       +                                        }
       +                                        $fill_button_vbox->();
       +                                }, $window);
       +                        $button_vbox->pack_start($button, FALSE, FALSE, 0);
       +                        $button->show;
       +                }
       +        };
       +
       +        $undo->signal_connect(
       +                clicked => sub {
       +                        if ($cur_replacement > 0) {
       +                                $cur_replacement--;
       +                                if ($cur_replacement == 0) {
       +                                        $undo->set_sensitive(FALSE);
       +                                }
       +                                $replacements[$cur_replacement]->[1] = $replacements[$cur_replacement]->[2];
       +                                $fill_button_vbox->();
       +                                $fill_text_buffer->();
       +                                $accept->hide;
       +                                my $word = $replacements[$cur_replacement]->[1];
       +                                $wordlabel->set_text("Word \"$word\" has multiple replacement options:");
       +                        }
       +                }, $window);
       +
       +        $accept->signal_connect(
       +                clicked => sub {
       +                        # write the changes to the original $substrings
       +                        foreach (@replacements) {
       +                                $substrings->[$_->[0]]->[1] = $_->[1];
       +                        }
       +                        $window->destroy;
       +                }, $window);
       +
       +        $stop_button->signal_connect(
       +                clicked => sub {
       +                        $stop = 1;
       +                        $window->destroy;
       +                }, $window);
       +
       +        $fill_button_vbox->();
       +        $fill_text_buffer->();
       +
       +        Gtk2->main;
       +        if ($stop) {
       +                die "Processing stopped at line $cur_lineno";
       +        }
       +}
       +
       +my $ID = 0;
       +my $STRING = 1;
       +
       +# Parse the configuration file into data type (currently only ID and STRING)
       +sub parse_config {
       +        my $f = shift;
       +        my $fh;
       +        if (!open($fh, "<", $f)) {
       +                warn "Can't open config file \"$f\"!\n";
       +                return undef;
       +        }
       +        my $line;
       +        my @commands;
       +        my $state = 0;
       +        my $IN_ID = 1;
       +        my $IN_STR = 2;
       +        my $cur_val = "";
       +        while ($line = <$fh>) {
       +                chomp($line);
       +                $state = 0;
       +                push(@commands, []);
       +                foreach my $char (split(//, $line)) {
       +                        if ($char eq "#") {
       +                                last;
       +                        } elsif ($char eq '"') {
       +                                if ($state & $IN_STR) {
       +                                        push(@{$commands[-1]}, {type => $STRING, value => $cur_val});
       +                                        $cur_val = "";
       +                                        $state &= ~$IN_STR;
       +                                } else {
       +                                        $cur_val = "";
       +                                        $state |= $IN_STR;
       +                                }
       +                        } elsif ($char eq " ") {
       +                                if ($state & $IN_ID) {
       +                                        push(@{$commands[-1]}, {type => $ID, value => $cur_val});
       +                                        $state &= ~$IN_ID;
       +                                        $cur_val = "";
       +                                } elsif ($state) {
       +                                        $cur_val .= $char;
       +                                }
       +                        } else {
       +                                if (!$state) {
       +                                        $state |= $IN_ID;
       +                                }
       +                                $cur_val .= $char;
       +                        }
       +                }
       +                if ($state & $IN_STR) {
       +                        warn "ERROR: Unterminated string in config:\n$line";
       +                        return undef;
       +                } elsif ($cur_val) {
       +                        push(@{$commands[-1]}, {type => $ID, value => $cur_val});
       +                        $cur_val = "";
       +                }
       +                # FIXME: check if this works
       +                if ($#{$commands[-1]} == -1) {
       +                        pop(@commands);
       +                }
       +        }
       +        close($fh);
       +
       +        return \@commands;
       +}
       +
       +# Load a file of replacement words into a hash table
       +sub load_table {
       +        my ($filename, $args, $config) = @_;
       +        my $fh;
       +        # if the paths are relative, find their absolute location based
       +        # on the location of the config file
       +        if (!file_name_is_absolute $filename) {
       +                my $config_dir = dirname $args->{"config"};
       +                $filename = rel2abs($filename, $config_dir);
       +        }
       +        if (!open($fh, "<", $filename)) {
       +                warn "Can't open table file \"$filename\"!\n";
       +                return undef;
       +        }
       +        my $line;
       +        my @words;
       +        my %table;
       +        while ($line = <$fh>) {
       +                chomp $line;
       +                next if (!$line);
       +                @words = split(/\Q$config->{tablesep}\E/, $line);
       +                if (@words != 2) {
       +                        warn "ERROR: Malformed line in file \"$filename\":\n$line\n";
       +                        close $fh;
       +                        return undef;
       +                }
       +                my $key = NFD($words[0]);
       +                if (exists $table{$key}) {
       +                        if ($args->{"checkduplicates"}) {
       +                                warn "WARNING: Duplicate word in file \"$filename\": " .
       +                                        "\"$key\", with replacement \"$words[1]\", " .
       +                                        "already exists with replacement \"$table{$key}\".\n";
       +                        }
       +                        $table{$key} = get_unique_words($table{$key} . $config->{choicesep} . $words[1], $config);
       +                } else {
       +                        $table{$key} = $words[1];
       +                }
       +        }
       +        close $fh;
       +        return \%table;
       +}
       +
       +# Load table for words to ignore - only the keys matter, since there is no replacement
       +sub load_ignore_table {
       +        my ($filename, $args) = @_;
       +        my $line;
       +        my %table;
       +        if (!file_name_is_absolute $filename) {
       +                my $config_dir = dirname $args->{"config"};
       +                $filename = rel2abs($filename, $config_dir);
       +        }
       +        my $fh;
       +        if (!open($fh, "<", $filename)) {
       +                warn "Can't open ignore file \"$filename\"!\n";
       +                return undef;;
       +        }
       +        while ($line = <$fh>) {
       +                chomp $line;
       +                if ($line) {
       +                        $table{NFD($line)} = "";
       +                }
       +        }
       +        close($fh);
       +        return \%table;
       +}
       +
       +# Generate all forms of a word by combining it with endings
       +# Returns:
       +# 0 - an error occurred
       +# 1 - everything's fine
       +sub expand_table {
       +        my ($cmd, $tables, $config) = @_;
       +        my $table_name = $cmd->[1]->{"value"};
       +        my $new_table_name = $cmd->[2]->{"value"};
       +        my $forms_name = $cmd->[3]->{"value"};
       +        if ($new_table_name eq "same") {
       +                $new_table_name = $table_name;
       +        }
       +        if (!exists($tables->{$table_name})) {
       +                warn "expand_table: table \"$table_name\" does not exist.\n";
       +                return 0;
       +        }
       +        if (!exists($tables->{$forms_name})) {
       +                warn "expand_table: table \"$forms_name\" does not exist.\n";
       +                return 0;
       +        }
       +        my $table = $tables->{$table_name};
       +        my $forms = $tables->{$forms_name};
       +
       +        my $noroot = 0;
       +        if ($#$cmd >= 4 && $cmd->[4]->{"value"} eq "noroot") {
       +                $noroot = 1;
       +        }
       +
       +        my %new_table;
       +        foreach my $word (keys %$table) {
       +                foreach my $ending (keys %$forms) {
       +                        # Some words and/or endings have multiple options, separated by $config->{choicesep}
       +                        # These must be temporarily separated in order to properly generate the forms
       +                        my @word_options;
       +                        my @stem_options = split(/\Q$config->{choicesep}\E/, $table->{$word});
       +                        my @ending_options = split(/\Q$config->{choicesep}\E/, $forms->{$ending});
       +                        foreach my $stem_option (@stem_options) {
       +                                foreach my $ending_option (@ending_options) {
       +                                        push(@word_options, $stem_option . $ending_option);
       +                                }
       +                        }
       +                        $new_table{$word . $ending} = join($config->{choicesep}, @word_options);
       +                }
       +                if (!$noroot) {
       +                        $new_table{$word} = $table->{$word};
       +                }
       +        }
       +        $tables->{$new_table_name} = \%new_table;
       +        return 1;
       +}
       +
       +# Check if the number and types of arguments given to a config command are right
       +# Returns:
       +# 0 - an error occurred
       +# 1 - everything's fine
       +sub check_args {
       +        my ($args, $cmd) = @_;
       +        my $cmd_name = $cmd->[0]->{"value"};
       +        if ($#$cmd - 1 < $#$args) {
       +                my $err = "ERROR: not enough arguments for command \"$cmd_name\":";
       +                foreach my $arg (@{$cmd}[1..$#$cmd]) {
       +                        $err .= " " . $arg->{"value"}
       +                }
       +                warn "$err\n";
       +                return 0;
       +        }
       +        my $arg_num = 0;
       +        while ($arg_num <= $#$args) {
       +                if ($cmd->[$arg_num + 1]->{"type"} != $args->[$arg_num]) {
       +                        my $err = "ERROR: argument type mismatch for command \"$cmd_name\".\n";
       +                        $err .= "Expected:";
       +                        foreach my $arg_type (@$args) {
       +                                $err .= " ID" if ($arg_type == $ID);
       +                                $err .= " STRING" if ($arg_type == $STRING);
       +                        }
       +                        $err .= "\nReceived:";
       +                        foreach my $arg (@{$cmd}[1..$#$cmd]) {
       +                                $err .= " ID" if ($arg->{"type"} == $ID);
       +                                $err .= " STRING" if ($arg->{"type"} == $STRING);
       +                        }
       +                        warn "$err\n";
       +                        return 0;
       +                }
       +                $arg_num++;
       +        }
       +        return 1;
       +}
       +
       +# Interpret the config file - load and expand tables, etc.
       +# $config_list - the list returned by parse_config
       +sub interpret_config {
       +        my ($config_list, $args) = @_;
       +        my %tables;
       +        my %config;
       +        $config{"table_paths"} = {};
       +        $config{"ignore"} = "";
       +        $config{"ignore_words"} = {};
       +        $config{"split"} = "\\s";
       +        $config{"beforeword"} = "\\s";
       +        $config{"afterword"} = "\\s";
       +        $config{"tablesep"} = "\t";
       +        $config{"choicesep"} = "\$";
       +        $config{"replacements"} = [];
       +        my %tmp_table_paths;
       +        my %mandatory_args = (
       +                "ignore" => [$STRING],
       +                "table" => [$ID],
       +                "expand" => [$ID, $ID, $ID],
       +                "match" => [$STRING, $STRING],
       +                "matchignore" => [$STRING],
       +                "replace" => [$ID],
       +                "split" => [$STRING],
       +                "beforeword" => [$STRING],
       +                "afterword" => [$STRING],
       +                "tablesep" => [$STRING],
       +                "choicesep" => [$STRING],
       +                "group" => [],
       +                "endgroup" => []
       +        );
       +        my $in_group = 0;
       +        foreach my $cmd (@$config_list) {
       +                # All load statements must be before expand statements
       +                # All expand, beforeword, and afterword statements must be before replace statements
       +                if ($cmd->[0]->{"type"} == $ID) {
       +                        if (!exists($mandatory_args{$cmd->[0]->{"value"}})) {
       +                                warn "ERROR: Unknown command \"" . $cmd->[0]->{"value"} . "\" in config\n";
       +                                return undef;
       +                        }
       +                        if (!check_args($mandatory_args{$cmd->[0]->{"value"}}, $cmd)) {
       +                                return undef;
       +                        }
       +                        if ($cmd->[0]->{"value"} eq "table") {
       +                                my $table = load_table $cmd->[2]->{"value"}, $args, \%config;
       +                                if (!$table) {
       +                                        return undef;
       +                                }
       +                                $tables{$cmd->[1]->{"value"}} = $table;
       +                                $tmp_table_paths{$cmd->[1]->{"value"}} = $cmd->[2]->{"value"};
       +                        } elsif ($cmd->[0]->{"value"} eq "expand") {
       +                                # FIXME: need to handle table paths when a new table name is used for the expansion
       +                                if (!expand_table($cmd, \%tables, \%config)) {
       +                                        return undef;
       +                                }
       +                        } elsif ($cmd->[0]->{"value"} eq "group") {
       +                                if ($in_group) {
       +                                        warn "ERROR: New group started without ending last one in config\n";
       +                                        return undef;
       +                                }
       +                                push @{$config{"replacements"}}, {"type" => "group", "words" => {}};
       +                                for (1..$#$cmd) {
       +                                        $config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1;
       +                                }
       +                                $in_group = 1;
       +                        } elsif ($cmd->[0]->{"value"} eq "endgroup") {
       +                                if (!$in_group) {
       +                                        warn "ERROR: endgroup command called while not in group\n";
       +                                        return undef;
       +                                }
       +                                $in_group = 0;
       +                        } elsif ($cmd->[0]->{"value"} eq "match") {
       +                                if ($in_group) {
       +                                        warn "ERROR: match command is invalid inside group\n";
       +                                        return undef;
       +                                }
       +                                push @{$config{"replacements"}}, {
       +                                        "type" => "match",
       +                                        "search" => NFD($cmd->[1]->{"value"}),
       +                                        "replace" => $cmd->[2]->{"value"}};
       +                                for (3..$#$cmd) {
       +                                        $config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1;
       +                                }
       +                        } elsif ($cmd->[0]->{"value"} eq "matchignore") {
       +                                if ($in_group) {
       +                                        warn "ERROR: matchignore command is invalid inside group\n";
       +                                        return undef;
       +                                }
       +                                push @{$config{"replacements"}}, {"type" => "match", "search" => NFD($cmd->[1]->{"value"})};
       +                                for (2..$#$cmd) {
       +                                        $config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1;
       +                                }
       +                        } elsif ($cmd->[0]->{"value"} eq "replace") {
       +                                if (!$in_group) {
       +                                        warn "ERROR: replace command called while not in group\n";
       +                                        return undef;
       +                                }
       +                                my $table = $cmd->[1]->{"value"};
       +                                if (!exists($tables{$table})) {
       +                                        warn "ERROR: nonexistent table \"$table\" in replace statement.\n";
       +                                        return undef;
       +                                }
       +                                if (exists($tmp_table_paths{$table})) {
       +                                        $config{"table_paths"}->{$tmp_table_paths{$table}} = [$#{$config{"replacements"}}, $table];
       +                                }
       +                                # Note: we don't need to check if $table{"choicesep"} was defined
       +                                # here since we can't ever get this far without first having
       +                                # loaded a table anyways
       +                                add_to_trie($table, $config{"replacements"}->[-1]->{"words"}, $tables{$table}, $args, \%config);
       +                        } elsif ($cmd->[0]->{"value"} eq "split") {
       +                                $config{"split"} = $cmd->[1]->{"value"};
       +                        } elsif ($cmd->[0]->{"value"} eq "beforeword") {
       +                                $config{"beforeword"} = $cmd->[1]->{"value"};
       +                        } elsif ($cmd->[0]->{"value"} eq "afterword") {
       +                                $config{"afterword"} = $cmd->[1]->{"value"};
       +                        } elsif ($cmd->[0]->{"value"} eq "tablesep") {
       +                                $config{"tablesep"} = $cmd->[1]->{"value"};
       +                        } elsif ($cmd->[0]->{"value"} eq "choicesep") {
       +                                $config{"choicesep"} = $cmd->[1]->{"value"};
       +                        } elsif ($cmd->[0]->{"value"} eq "ignore") {
       +                                $config{"ignore"} = $cmd->[1]->{"value"};
       +                                my $table = load_ignore_table $cmd->[1]->{"value"}, $args;
       +                                if (!$table) {
       +                                        return undef;
       +                                }
       +                                $config{"ignore_words"} = $table;
       +                        } else {
       +                                warn "ERROR: unknown command \"" . $cmd->[0]->{"value"} . "\" in config.\n";
       +                                return undef;
       +                        }
       +                } else {
       +                        my $err =  "ERROR: line does not start with command:\n";
       +                        foreach my $cmd_part (@$cmd) {
       +                                $err .= $cmd_part->{"value"};
       +                        }
       +                        warn "$err\n";
       +                        return undef;
       +                }
       +        }
       +        if ($in_group) {
       +                warn "ERROR: unclosed group in config\n";
       +                return undef;
       +        }
       +        if (!$config{"ignore"}) {
       +                warn "ERROR: no file of words to ignore specified.\n";
       +                return undef;
       +        }
       +        return \%config;
       +}
       +
       +# load the config file
       +# Returns:
       +# the config hash or undef if an error occurred
       +sub load_config {
       +        my $args = shift;
       +        my $config_list = parse_config($args->{"config"});
       +        if (!$config_list) {
       +                return undef;
       +        }
       +        return interpret_config $config_list, $args;
       +}
       +
       +# Handle the action returned by `prompt_unknown_word`
       +# $config - the current program config
       +# $args - the command line arguments
       +# Currently accepted values for $action:
       +# ["ignore", "run", $word] - only ignore $word for the rest of this run
       +# ["ignore", "permanent", $word] - write $word to the permanent ignore file
       +# ["add", $word, $replace_word, $table_path] - add $word to the table 
       +#         corresponding to $table_path with $replace_word as its replacement. Note that
       +#         only tables directly corresponding to paths work here - tables that only
       +#         were created through "expand" in the config aren't ever shown separately
       +#         in `prompt_unknown_word`
       +# ["reload"] - reload the configuration file
       +# Returns:
       +# 0 - nothing needs to be done
       +# 1 - the current line needs to be re-transliterated with the new config
       +# 2 - an error occurred while reloading the config
       +sub handle_unknown_word_action {
       +        my ($action, $config, $args) = @_;
       +        if ($action->[0] eq "ignore") {
       +                $config->{"ignore_words"}->{$action->[2]} = "";
       +                if ($action->[1] eq "permanent") {
       +                        my $fh;
       +                        if (!open($fh, ">>", $config->{"ignore"})) {
       +                                warn "ERROR: Can't open ignore file for appending.\n";
       +                                return 1;
       +                        }
       +                        print($fh $action->[2] . "\n");
       +                        close($fh);
       +                } elsif ($action->[1] eq "run") {
       +                        # Print to error file if ignore isn't permanent
       +                        return 0 if ($args->{"errors"} eq "");
       +                        my $fh;
       +                        if (!open($fh, ">>", $args->{"errors"})) {
       +                                warn "ERROR: Can't open error file \"$args->{errors}\".\n";
       +                                return 0;
       +                        }
       +                        print($fh $action->[2] . "\n");
       +                        close($fh);
       +                }
       +                return 0;
       +        } elsif ($action->[0] eq "add") {
       +                my $table_path = $action->[3];
       +                my $word = $action->[1];
       +                my $replace_word = $action->[2];
       +                my $fh;
       +                if (!open($fh, ">>", $table_path)) {
       +                        warn "ERROR: Can't open table file \"$table_path\" for appending.\n";
       +                        return 1;
       +                }
       +                print($fh $word . "," . $replace_word . "\n");
       +                close($fh);
       +                my $replacement_id = $config->{"table_paths"}->{$table_path}->[0];
       +                my $trie = $config->{"replacements"}->[$replacement_id]->{"words"};
       +                add_to_trie($config->{"table_paths"}->{$table_path}->[1], $trie, {$word => $replace_word});
       +                return 1;
       +        } elsif ($action->[0] eq "reload") {
       +                my $new_config = load_config $args;
       +                if ($new_config) {
       +                        %$config = %$new_config;
       +                        return 1;
       +                } else {
       +                        return 2;
       +                }
       +        }
       +}
       +
       +# Split $substrings into single words based on the "split" option
       +# in $config.
       +# $substrings can already be split at this point; only the
       +# ones that haven't been transliterated yet are modified
       +sub split_words {
       +        my ($config, $substrings) = @_;
       +        my @substrings_new;
       +        my $split_re = qr/($config->{"split"})/;
       +        foreach my $cur_substr (@$substrings) {
       +                if ($cur_substr->[0] == 1) {
       +                        push(@substrings_new, $cur_substr);
       +                        next;
       +                }
       +
       +                my @words = split(/$split_re/, $cur_substr->[1]);
       +                for my $i (0..$#words) {
       +                        # Word is not delimiter
       +                        # Split produces an empty field at the beginning if the string
       +                        # starts with the delimiter
       +                        if ($i % 2 == 0) {
       +                                push(@substrings_new, [0, $words[$i], $words[$i]]) if ($words[$i] ne '');
       +                        } else {
       +                                # Delimiters can count as already replaced
       +                                push(@substrings_new, [1, $words[$i], $words[$i]]);
       +                        }
       +                }
       +        }
       +        @$substrings = @substrings_new;
       +}
       +
       +# small helper function to add a untransliterated string to the last substring
       +# if that is not transliterated yet, or push it onto @$substrings otherwise
       +# -> used to keep all untransliterated text in one piece
       +# since this is also used for the "nofinal" attribute on "match", it takes
       +# an original and replaced string (since, when using "match" and "nofinal",
       +# the original string was replaced, but is still marked as unknown)
       +sub push_unknown {
       +        my ($substrings, $orig, $replaced) = @_;
       +        $replaced //= $orig;
       +        if (@$substrings && !$substrings->[-1]->[0]) {
       +                $substrings->[-1]->[1] .= $replaced;
       +                $substrings->[-1]->[2] .= $orig;
       +        } else {
       +                push(@$substrings, [0, $replaced, $orig]);
       +        }
       +}
       +
       +# Replace a word in $substrings based on $replace_config using regex
       +# $replace_config->{"search"} is the word to replace
       +# $replace_config->{"replace"} is the replacement word
       +#        if $replace_config->{"replace"} is undefined, just splits
       +#        $substrings at the the match and marks that the match has
       +#        been transliterated - currently used for "matchignore"
       +# $replace_config->{"beginword"}, $replace_config->{"afterword"} -
       +#        specifies if the match is only valid when $config->{"beforeword"}
       +#        or $config->{"afterword"} occur before or after it, respectively
       +sub replace_match {
       +        my ($config, $replace_config, $substrings) = @_;
       +        my $beginword = exists $replace_config->{"beginword"};
       +        my $endword = exists $replace_config->{"endword"};
       +        my $fullword = $beginword && $endword;
       +        my $beforeword = $config->{"beforeword"};
       +        my $afterword = $config->{"afterword"};
       +        my $word = $replace_config->{"search"};
       +        my $replace_word = $replace_config->{"replace"};
       +        if ($fullword) {
       +                $word = qr/(\A|$beforeword)$word(\z|$afterword)/;
       +        } elsif ($beginword) {
       +                $word = qr/(\A|$beforeword)$word/;
       +        } elsif ($endword) {
       +                $word = qr/$word(\z|$afterword)/;
       +        } else {
       +                $word = qr/$word/;
       +        }
       +
       +        my @substrings_new;
       +        # only modify $substrings at all if the word was found
       +        my $found_word = 0;
       +        my $last_idx;
       +        # @substrings_new is only used if needed to improve efficiency
       +        foreach my $i (0..$#$substrings) {
       +                if ($substrings->[$i]->[0]) {
       +                        # FIXME: is there a way to make it more efficient by keeping the old array?
       +                        # This is a major bottleneck
       +                        # Note: the above statement *may* be a bit exaggerated
       +                        if ($found_word) {
       +                                push(@substrings_new, $substrings->[$i]);
       +                        }
       +                        next;
       +                }
       +                $last_idx = 0;
       +                my $i0 = 0;
       +                my $i1 = 0;
       +                while ($substrings->[$i]->[1] =~ m/$word/g) {
       +                        if (!$found_word) {
       +                                $found_word = 1;
       +                                if ($i != 0) {
       +                                        push(@substrings_new, @{$substrings}[0..$i-1]);
       +                                }
       +                        }
       +                        # This mess is needed to reliably match $beforeword and $afterword and put the captured
       +                        # "splitting" characters back into the text. This would be much easier just using
       +                        # a lookbehind and lookahead, but I couldn't find a way to also match beginning and
       +                        # end of string that way.
       +                        $i0 = $-[0];
       +                        $i1 = $+[0];
       +                        if ($fullword) {
       +                                $i0 += length($1);
       +                                $i1 -= length($2);
       +                                # pos need to be decreased so that matches still work right next to each other
       +                                pos($substrings->[$i]->[1]) -= length($2);
       +                        } elsif ($beginword) {
       +                                $i0 += length($1);
       +                        } elsif ($endword) {
       +                                $i1 -= length($1);
       +                                pos($substrings->[$i]->[1]) -= length($1);
       +                        }
       +                        if ($last_idx != $i0) {
       +                                my $unknown = substr($substrings->[$i]->[1], $last_idx, $i0-$last_idx);
       +                                push_unknown \@substrings_new, $unknown;
       +                        }
       +                        my $orig_str = substr($substrings->[$i]->[1], $i0, $i1-$i0);
       +                        my $replace_str = $replace_word // $orig_str;
       +                        if ($replace_config->{"nofinal"}) {
       +                                push_unknown \@substrings_new, $orig_str, $replace_str;
       +                        } else {
       +                                push(@substrings_new, [1, $replace_str, $orig_str]);
       +                        }
       +                        $last_idx = $i1;
       +                }
       +                if ($last_idx < length($substrings->[$i]->[1]) && $found_word) {
       +                        my $unknown = substr($substrings->[$i]->[1], $last_idx);
       +                        push_unknown \@substrings_new, $unknown;
       +                }
       +        }
       +        if ($found_word) {
       +                @$substrings = @substrings_new;
       +        }
       +}
       +
       +# Replace a group, i.e. replace all the words in a trie
       +# $replace_config->{"words"} - the root node of the trie
       +# $replace_config->{"beginword"}, $replace_config->{"endword"} -
       +#        same as in `replace_match`
       +sub replace_group {
       +        my ($config, $replace_config, $substrings) = @_;
       +        my @substrings_new;
       +        # Recurse backwords towards the root node of the trie to find the first
       +        # node with a key "final" which satisfies the ending condition (if "endword" is set)
       +        # Returns the id *after* the last match and the node that was found
       +        # with the key "final" (or undef, if nothing matched)
       +        my $find_final = sub {
       +                my ($i, $tmp_cur_node, $s) = @_;
       +                do {
       +                        my $after_ch = substr($s->[1], $i, 1);
       +                        if (exists $tmp_cur_node->{"final"} && (!exists($replace_config->{"endword"}) ||
       +                                $after_ch eq "" || $after_ch =~ $config->{"afterword"})) {
       +                                return ($i, $tmp_cur_node);
       +                        }
       +                        $i--;
       +                } while ($tmp_cur_node = $tmp_cur_node->{"parent"});
       +                # none of the points were appropriate for breaking the word, so
       +                # $tmp_cur_node now points to the nonexistent parent node of the
       +                # root node
       +                return ($i, undef);
       +        };
       +        foreach my $s (@$substrings) {
       +                if ($s->[0]) {
       +                        push(@substrings_new, $s);
       +                        next;
       +                }
       +                my $cur_node = $replace_config->{"words"};
       +                my $start_i = 0;
       +                my $i = 0;
       +                # This deliberately goes off the end of the string! $cur_node is always "one behind" $i
       +                # since the node is only advanced in the iteration *after* $i has increased, meaning that
       +                # $i has to already be after the end of the string for the first if statement to definitely
       +                # fail, causing the elsif statement to handle that case
       +                while ($i <= length($s->[1])) {
       +                        # This works even when $i is one index after the end of the string - it just returns "" then
       +                        my $ch = substr($s->[1], $i, 1);
       +                        if (exists $cur_node->{$ch}) {
       +                                if ($cur_node == $replace_config->{"words"}) {
       +                                        my $before_ch = $i > 0 ? substr($s->[1], $i - 1, 1) : "";
       +                                        if (exists($replace_config->{"beginword"}) &&
       +                                                $before_ch ne "" && $before_ch !~ $config->{"beforeword"}) {
       +                                                push_unknown \@substrings_new, $ch;
       +                                                $i++;
       +                                                next;
       +                                        }
       +                                        $start_i = $i;
       +                                }
       +                                $cur_node = $cur_node->{$ch};
       +                        } elsif (exists $cur_node->{"final"} || $cur_node != $replace_config->{"words"} || $i == length($s->[1])-1) {
       +                                my $tmp_cur_node = $cur_node;
       +                                ($i, $tmp_cur_node) = $find_final->($i, $tmp_cur_node, $s);
       +                                if (!defined($tmp_cur_node)) {
       +                                        push_unknown \@substrings_new, substr($s->[1], $i + 1, 1);
       +                                        $i += 2;
       +                                } else {
       +                                        push(@substrings_new, [1, $tmp_cur_node->{"final"}, substr($s->[1], $start_i, $i-$start_i)]);
       +                                }
       +                                $cur_node = $replace_config->{"words"};
       +                                next;
       +                        } else {
       +                                push_unknown \@substrings_new, $ch;
       +                        }
       +                        $i++;
       +                }
       +        }
       +        @$substrings = @substrings_new;
       +}
       +
       +# Perform all replacements on $line based on $config
       +# $substrings: array of arrays - each one has three elements:
       +# first 0 or 1, indicating if the substring following it has already
       +# been replaced or not (1 means it has been replaced), then the
       +# transliterated string, and lastly the original string.
       +# If the first element is 0, the second two elements are obviously same
       +sub replace_line {
       +        my ($config, $line) = @_;
       +        my $substrings = [[0, $line, $line]];
       +        foreach my $replacement (@{$config->{"replacements"}}) {
       +                if ($replacement->{"type"} eq "match") {
       +                        replace_match($config, $replacement, $substrings);
       +                } elsif ($replacement->{"type"} eq "group") {
       +                        replace_group($config, $replacement, $substrings);
       +                }
       +        }
       +        # splits all words at the end so that the splitting characters
       +        # aren't taken as unknown words and the unknown words are (hopefully)
       +        # in better chunks for prompting the user about them
       +        split_words($config, $substrings);
       +
       +        return $substrings;
       +}
       +
       +# NOTE: MUST ALWAYS ADD REPLACEMENT WORDS FIRST!
       +# If an ignore word is added which is attached to a word that should have a replacement
       +# added and just that word is selected to ignore, you never get a chance to add a
       +# replacement for the other word that it is attached to
       +
       +# Handle unknown words
       +# $substrings - the current substrings with unknown words
       +# $config - the program config
       +# $args - the command line args
       +# $cur_lineno - display string to show the user the current line number
       +# Returns:
       +# 1 - the current line must be replaced again with the new config
       +# 0 - everything's fine
       +sub get_unknown_words {
       +        my ($substrings, $config, $args, $cur_lineno) = @_;
       +        foreach my $i (0..$#$substrings) {
       +                my $word = $substrings->[$i];
       +                if (!$word->[0] && !exists($config->{"ignore_words"}->{$word->[1]})) {
       +                        my $contextl = "";
       +                        my $contextl_orig = "";
       +                        foreach my $j (0..$i-1) {
       +                                $contextl .= $substrings->[$j]->[1];
       +                                $contextl_orig .= $substrings->[$j]->[2];
       +                        }
       +                        my $contextr = "";
       +                        my $contextr_orig = "";
       +                        foreach my $j ($i+1..$#$substrings) {
       +                                $contextr .= $substrings->[$j]->[1];
       +                                $contextr_orig .= $substrings->[$j]->[2];
       +                        }
       +                        my $action = prompt_unknown_word($contextl, $contextl_orig,
       +                                $word->[1], $contextr, $contextr_orig,
       +                                $config->{"table_paths"}, "$cur_lineno"
       +                        );
       +                        # if $ret == 2, config could not be loaded
       +                        # if $ret == 1, line must be redone with new config
       +                        my $ret = handle_unknown_word_action($action, $config, $args);
       +                        # keep retrying until the user chooses an action which
       +                        # didn't throw an error
       +                        while ($ret == 2) {
       +                                $action = prompt_unknown_word($contextl, $contextl_orig,
       +                                        $word->[1], $contextr, $contextr_orig,
       +                                        $config->{"table_paths"}, "$cur_lineno", 1);
       +                                $ret = handle_unknown_word_action($action, $config, $args);
       +                        }
       +                        # re-transliterate the line with the new config
       +                        if ($ret == 1) {
       +                                return 1;
       +                        }
       +                }
       +        }
       +        return 0;
       +}
       +
       +# Main replacement function
       +# Opens the input file ($args->{"input"}) and writes the transliterated text
       +# to the file handle $outputfh, prompting the user for unknown words or
       +# word choices (if those aren't disabled on the command line)
       +sub replace {
       +        my ($config, $args, $outputfh) = @_;
       +        # Is there *really* no more efficient way to get the total number of lines?
       +        open my $fh, "<", $args->{"input"} or die "ERROR: Cannot open input file \"$args->{input}\" for reading.\n";
       +        my $total_lines = 0;
       +        while (<$fh>) {$total_lines++};
       +        close $fh;
       +        open $fh, "<", $args->{"input"} or die "ERROR: Cannot open input file \"$args->{input}\" for reading.\n";
       +        while (my $line = <$fh>) {
       +                next if $. < $args->{"start"};
       +                my $nfd_line = NFD($line);
       +                my $substrings = replace_line($config, $nfd_line);
       +
       +                if (!$args->{"nounknowns"}) {
       +                        # re-transliterate the string if the config was reloaded
       +                        while (get_unknown_words($substrings, $config, $args, "$./$total_lines")) {
       +                                $substrings = replace_line($config, $nfd_line);
       +                        }
       +                } elsif ($args->{"debug"}) {
       +                        foreach my $s (@$substrings) {
       +                                if (!$s->[0] && !exists($config->{"ignore_words"}->{$s->[1]})) {
       +                                        warn "Unknown word: \"$s->[1]\"\n";
       +                                }
       +                        }
       +                }
       +                if (!$args->{"nochoices"}) {
       +                        prompt_choose_word($substrings, "$./$total_lines", $config);
       +                } elsif ($args->{"debug"}) {
       +                        foreach my $s (@$substrings) {
       +                                if ($s->[0] && $s->[1] =~ /\Q$config->{choicesep}\E/) {
       +                                        my $num_choices = split /\Q$config->{choicesep}\E/, $s->[1];
       +                                        warn "Word \"$s->[1]\" with $num_choices word choices.\n";
       +                                }
       +                        }
       +                }
       +
       +                foreach (@$substrings) {
       +                        print $outputfh $_->[1];
       +                }
       +        }
       +        close $fh;
       +}
       +
       +my %args = ("config" => "config", "start" => 1, "errors" => "", "output" => "");
       +GetOptions(
       +        \%args, "debug",
       +        "nochoices", "nounknowns",
       +        "force", "start=i",
       +        "output=s", "config=s",
       +        "errors=s", "help",
       +        "checkduplicates") or pod2usage(1);
       +
       +pod2usage(-exitval => 0, -verbose => 2) if $args{"help"};
       +pod2usage(1) if $#ARGV != 0 && !$args{"checkduplicates"};
       +
       +if (!-f $args{"config"}) {
       +        die "ERROR: config file \"$args{config}\" does not exist or is not a file.\n";
       +}
       +my $config = load_config \%args;
       +if (!$config) {
       +        die "ERROR: Invalid config\n";
       +}
       +exit 0 if ($args{"checkduplicates"});
       +
       +my $input = $ARGV[0];
       +if (!-f $input) {
       +        die "ERROR: input file \"$input\" does not exist or is not a file.\n";
       +}
       +$args{"input"} = $input;
       +
       +if (-f $args{"errors"} && !$args{"force"}) {
       +        my $choice = "";
       +        while ($choice !~ /^[yn]$/) {
       +                print STDERR "\"$args{errors}\" already exists. Do you want to overwrite it? ";
       +                $choice = <STDIN>;
       +                chomp $choice;
       +        }
       +        die "ERROR: \"$args{errors}\" already exists.\n" if $choice ne "y";
       +}
       +
       +my $outputfh;
       +if ($args{"output"} eq "") {
       +        warn "WARNING: no output file supplied; printing to STDOUT\n";
       +        open $outputfh, ">&STDOUT";
       +} elsif (-f $args{"output"} && !$args{"force"}) {
       +        my $choice = "";
       +        while ($choice !~ /^[aoe]$/) {
       +                print STDERR "\"$args{output}\" already exists. (a)ppend, (o)verwrite, or (e)xit? ";
       +                $choice = <STDIN>;
       +                chomp $choice;
       +        }
       +        if ($choice eq "a") {
       +                open $outputfh, ">>", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n";
       +        } elsif ($choice eq "e") {
       +                die "ERROR: \"$args{output}\" already exists.\n";
       +        } else {
       +                open $outputfh, ">", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n";
       +        }
       +} else {
       +        open $outputfh, ">", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n";
       +}
       +
       +replace($config, \%args, $outputfh);
       +close $outputfh;
       +
       +__END__
       +
       +=head1 NAME
       +
       +transliterate.pl - Transliterate text files
       +
       +=head1 SYNOPSIS
       +
       +transliterate.pl [options][input file]
       +
       +Start the transliteration engine with the given file as input.
       +
       +=head1 OPTIONS
       +
       +=over 8
       +
       +=item B<< --output <filename> >>
       +
       +Sets the output file to print to.
       +
       +If the file exists already and C<--force> is not set, the user is asked
       +if the file should be overwritten or appended to.
       +
       +B<Default:> STDOUT (print to terminal)
       +
       +=item B<< --config <filename> >>
       +
       +Sets the configuration file to use.
       +
       +B<Default:> "config"
       +
       +=item B<--checkduplicates>
       +
       +Prints all duplicate words within single table files and across tables
       +that are replaced within the same group, then exits the program.
       +
       +Note that this simply prints B<all> duplicates, even ones that are
       +legitimate. When duplicates are found during normal operation of
       +the program, they are simply combined in exactly the same way as the
       +regular word choices.
       +
       +=item B<--nochoices>
       +
       +Disables prompting for the right word when multiple replacement words exist.
       +
       +This can be used to "weed out" all the unknown words before
       +commencing the laborious task of choosing the right word every time
       +multiple options exist.
       +
       +=item B<--nounknowns>
       +
       +Disables prompting for the right word when a word is not found in the database.
       +
       +This can be used together with B<--nochoices> to perform a quick test of how
       +well the actual engine is working without having to click through all the
       +prompts.
       +
       +=item B<--debug>
       +
       +This option is only useful for automatic testing of the transliteration engine.
       +
       +If C<--nochoices> is enabled, each word in the input with multiple choices will
       +be output, along with the number of choices (can be used to test the proper
       +functioning of C<choicesep> in the config file).
       +
       +If C<--nounknowns> is enabled, each unknown word in the input is printed
       +(can be used to test that the C<ignore> options are working correctly).
       +
       +=item B<--force>
       +
       +Always overwrites the output and error file without asking.
       +
       +=item B<< --start <line number> >>
       +
       +Starts at the given line number instead of the beginning of the file.
       +
       +Note: when "Stop processing" is pressed, the current line number is
       +printed out. This is the current line that was being processed, so it
       +has not been printed to the output file yet and thus the program must
       +be resumed at that line, not the one afterwards.
       +
       +=item B<< --errors <filename> >>
       +
       +Specifies a file to write errors in. Note that this does not refer to
       +actual errors, but to any words that were temporarily ignored
       +(i.e. words for which "Ignore: This run" was clicked).
       +
       +If no file is specified, nothing is written. If a file is specified
       +that already exists and C<--force> is not set, the user is prompted
       +for action.
       +
       +=item B<--help>
       +
       +Display the full documentation.
       +
       +=back
       +
       +=head1 DESCRIPTION
       +
       +B<transliterate.pl> will read the given input file and transliterate it
       +based on the given configuration file, prompting the user for action if
       +a word has multiple replacement options or is not found in the database.
       +
       +See L</"CONFIGURATION"> for details on what is possible.
       +
       +=head1 WORD CHOICE WINDOW
       +
       +The word choice window is opened any time one word has multiple replacement
       +options and prompts the user to choose one.
       +
       +For each word with multiple options, the user must choose the right option
       +and then press "Accept changes" to finalize the transliteration of the
       +current line. Before the line is finalized, the user may press "Undo" to
       +undo any changes on the current line.
       +
       +"Stop processing" will exit the program and print the line number that was
       +currently being processed.
       +
       +=head1 UNKNOWN WORD WINDOW
       +
       +The unknown word window is opened any time a word could not be replaced.
       +
       +Both the context from the original language and the context from the
       +transliterated version (so far) is shown. If a part of the text is
       +selected in one of the text boxes and "Use selection as word" is
       +pressed for the appropriate box, the selected text is used for the
       +action that is taken subsequently.
       +
       +The possible actions are:
       +
       +=over 8
       +
       +=item Ignore
       +
       +"This run" only ignores the word until the program exist, while
       +"Permanently" saves the word in the ignore file specified in the
       +configuration.
       +
       +=item Add to list
       +
       +Adds the word typed in the text box beside "Add replacement" to the
       +selected table file and re-runs the replacement on the current line.
       +
       +The filtering for which table files are actually shown here is
       +currently a bit rudimentary. First, all paths that are used in the
       +C<table> statements in the config are put into a list. Then, the
       +paths corresponding to any tables used for word endings in the
       +C<expand> statements are removed from the list, and that is what
       +is shown in this window. The reason for removing those paths from
       +the list is that it gets somewhat confusing when all the tables
       +that are only used for word endings are also in the list, and it
       +is very unlikely that an unknown word would need to be written to
       +one of those files. If necessary, the word can always be added
       +manually and the config reloaded. Note also that only actual
       +table paths are shown here, not the tables themselves - this is
       +not necessarily a one-to-one mapping since new tables can be
       +generated with the C<expand> statements.
       +
       +=item Reload config
       +
       +Reload the configuration file along with all tables an re-runs the
       +replacement on the current line. Note that this can take a short
       +while since the entire word database has to be reloaded.
       +
       +=item Stop processing
       +
       +Prints the current line number to the terminal and exits the program.
       +
       +=back
       +
       +=head1 CONFIGURATION
       +
       +These are the commands accepted in the configuration file.
       +Any parameters in square brackets are optional.
       +
       +The C<match>, C<matchignore>, and C<replace> commands are executed in
       +the order they are specified, except that all C<replace> commands within
       +the same group are replaced together.
       +
       +Note that any duplicate words found will cause the user to be prompted
       +to choose one option every time the word is replaced in the input text.
       +
       +=over 8
       +
       +=item B<split> <regex string>
       +
       +Sets the RegEx string to be used for splitting words. This is only used
       +for splitting the words which couldn't be replaced after all replacement
       +has been done, before prompting the user for unknown words.
       +
       +Note that C<split> should probably always contain at least C<\n>, since
       +otherwise all of the newlines will be marked as unknown words. Usually,
       +this will be included anyways through C<\s>.
       +
       +Note also that C<split> should probably include the C<+> RegEx-quantifier
       +since that allows the splitting function in the end to ignore several
       +splitting characters right after each other (e.g. several spaces) in one
       +go instead of splitting the string again for every single one of them.
       +This shouldn't actually make any difference functionality-wise, though.
       +
       +B<Default:> \s (all whitespace)
       +
       +=item B<beforeword> <regex string>
       +
       +Sets the RegEx string to be matched before a word if C<beginword> is set.
       +
       +B<Default:> \s
       +
       +=item B<afterword> <regex string>
       +
       +Sets the RegEx string to be matched after a word if C<endword> is set.
       +
       +Note that C<afterword> should probably always contain at least C<\n>,
       +since otherwise words with C<endword> set will not be matched at the
       +end of a line.
       +
       +C<beforeword> and C<afterword> will often be exactly the same, but
       +they are left as separate options in case more fine-tuning is needed.
       +
       +B<Default:> \s
       +
       +=item B<tablesep> <string>
       +
       +Sets the separator used to split the lines in the table files into the
       +original and replacement word.
       +
       +B<Default:> Tab
       +
       +=item B<choicesep> <string>
       +
       +Sets the separator used to split replacement words into multiple choices for
       +prompting the user.
       +
       +B<Default:> $
       +
       +=item B<ignore> <filename>
       +
       +Sets the file of words to ignore.
       +
       +This has to be set even if the file is just empty because the user can
       +add words to it from the unknown word window.
       +
       +=item B<table> <table identifier> <filename>
       +
       +Load the table from C<< <filename> >>, making it available for later use in the
       +C<expand> and C<replace> commands using the identifier C<< <table identifier> >>.
       +
       +Note that if C<< <filename> >> is not an absolute path, it is taken to be relative
       +to the location of the configuration file.
       +
       +The table files simply consist of C<tablesep>-separated values, with the word in the
       +original language first and the replacement word second. The replacement word
       +can optionally have several parts separated by C<choicesep>, which will cause the
       +user to be prompted to choose one of the options.
       +
       +=item B<expand> <table identifier> <new table identifier> <word ending table> [noroot]
       +
       +Expand the table C<< <table identifier> >>, i.e. generate all the word forms using
       +the word endings in C<< <word ending table> >>, saving the result as a table with the
       +identifier C<< <new table identifier> >>. If C<same> is specified as C<< <new table
       +identifier> >>, the original C<< <table identifier> >> is used instead.
       +
       +If C<noroot> is set, the root forms of the words are not kept.
       +
       +If the replacement for a word ending contains C<choicesep>, it is split and each part
       +is combined with the root form separately and the user is prompted to choose one of
       +the options later. it is thus possible to allow multiple choices for the ending if
       +there is a distinction in the replacement language but not in the source language.
       +Note that each of the root words is also split into its choices (if necessary)
       +during the expanding, so it is possible to use C<choicesep> in both the endings
       +and root words.
       +
       +Note that the paths of all tables used for <word ending table> are removed from the
       +list of paths that is later shown in the unknown word window. See L</"UNKNOWN WORD
       +WINDOW"> for details.
       +
       +=item B<match> <regex string> <replacement string> [beginword] [endword] [nofinal]
       +
       +Perform a RegEx match using the given C<< <regex string> >>, replacing it with
       +C<< <replacement string> >>. Note that the replacement cannot contain any RegEx
       +(e.g. groups) in it. C<beginword> and C<endword> specify whether the match must
       +be at the beginning or ending of a word, respectively, using the RegEx specified
       +in C<beforeword> and C<afterword>. If C<nofinal> is set, the string is not marked
       +as transliterated after the replacement, allowing it to be modified by subsequent
       +C<match> or C<replace> commands.
       +
       +=item B<matchignore> <regex string> [beginword] [endword]
       +
       +Performs a RegEx match in the same manner as C<match>, except that the original
       +match is used as the replacement instead of specifying a replacement string.
       +
       +=item B<group> [beginword] [endword]
       +
       +Begins a replacement group. All C<replace> commands must occur between C<group>
       +and C<endgroup>, since they are then grouped together and replaced in one go.
       +C<beginword> and C<endword> act in the same way as specified for C<match> and
       +apply to all C<replace> statements in this group.
       +
       +=item B<replace> <table identifier>
       +
       +Replace all words in the table with the identifier C<< <table identifier> >>,
       +using the C<beginword> and C<endword> settings specified by the current group.
       +
       +Note that a table must have been loaded (or generated using C<expand>)
       +before being used in a C<replace> statement.
       +
       +=item B<endgroup>
       +
       +End a replacement group.
       +
       +=back
       +
       +=head1 SEE ALSO
       +
       +perlre, perlretut
       +
       +=head1 LICENSE
       +
       +Copyright (c) 2019, 2020 lumidify <nobody[at]lumidify.org>
       +
       +Permission to use, copy, modify, and distribute this software for any
       +purpose with or without fee is hereby granted, provided that the above
       +copyright notice and this permission notice appear in all copies.
       +
       +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
       +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
       +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
       +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
       +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
       +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
       +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
       +
       +=cut