transliterate.pl - transliterate - Transliteration engine
HTML git clone git://lumidify.org/transliterate.git (fast, but not encrypted)
HTML git clone https://lumidify.org/transliterate.git (encrypted, but very slow)
HTML git clone git://4kcetb7mo7hj6grozzybxtotsub5bempzo4lirzc3437amof2c2impyd.onion/transliterate.git (over tor)
DIR Log
DIR Files
DIR Refs
DIR README
DIR LICENSE
---
transliterate.pl (83637B)
---
1 #!/usr/bin/env perl
2
3 # Proudly written using vi (OpenBSD nvi)
4
5 # NOTE: If you're wondering why the error codes used by the functions are so
6 # inconsistent, go ask my former self
7
8 # NOTE 2: This codebase has grown as new features were needed, so it's quite
9 # ugly now, but I haven't had time to clean it up.
10
11 use strict;
12 use warnings;
13 use utf8;
14 use feature 'unicode_strings';
15 use open qw< :encoding(UTF-8) >;
16 binmode STDIN, ":encoding(UTF-8)";
17 binmode STDOUT, ":encoding(UTF-8)";
18 binmode STDERR, ":encoding(UTF-8)";
19 use Unicode::Normalize;
20 use Glib qw/TRUE FALSE/;
21 use Gtk3 '-init';
22 use Getopt::Long;
23 use Pod::Usage;
24 use Scalar::Util qw(weaken);
25 use File::Basename qw(dirname);
26 use File::Spec::Functions qw(rel2abs file_name_is_absolute);
27
28 # takes a string of words separated by '$config->{choicesep}' and returns a new string in the
29 # same format with duplicates removed
30 sub get_unique_words {
31 my ($word, $config) = @_;
32 my %tmp;
33 my @words_uniq = grep !$tmp{$_}++, split /\Q$config->{choicesep}\E/, $word;
34 return join $config->{choicesep}, @words_uniq;
35 }
36
37 # Adds all words in $words to $trie
38 # Automatically combines duplicate words with "$config->{choicesep}" inbetween
39 sub add_to_trie {
40 my ($table_name, $trie, $words, $args, $config, $override) = @_;
41 foreach my $word (keys %$words) {
42 my $cur_node = $trie;
43 foreach my $char (split //, $word) {
44 if (!exists($cur_node->{$char})) {
45 $cur_node->{$char}->{"parent"} = $cur_node;
46 # This is required to avoid circular references
47 # (otherwise, the garbage collector doesn't ever
48 # destroy these nodes, leading to the memory
49 # consumption growing without restraint if
50 # "Reload config" is used)
51 weaken($cur_node->{$char}->{"parent"});
52 }
53 $cur_node = $cur_node->{$char};
54 }
55 if (exists($cur_node->{"final"})) {
56 if ($override) {
57 $cur_node->{"final"} = $words->{$word};
58 next;
59 }
60 if ($args->{"checkduplicates"}) {
61 warn "WARNING: Duplicate word \"$word\". Last occurrence as " .
62 "\"$cur_node->{final}\" in table \"$cur_node->{table_name}\", " .
63 "current occurrence as \"$words->{$word}\" in " .
64 "table \"$table_name\.\n";
65 }
66 $cur_node->{"final"} = get_unique_words($cur_node->{"final"} . $config->{choicesep} . $words->{$word}, $config);
67 } else {
68 $cur_node->{"final"} = $words->{$word};
69 if ($args->{"checkduplicates"}) {
70 $cur_node->{"table_name"} = $table_name;
71 }
72 }
73 }
74 }
75
76 # Prompt user when no replacement has been found for a word
77 # $word is the word that was not found and $context* the context,
78 # $word_repl is the replacement word - this is only used when the window is
79 # called from the word choice window, since the original and
80 # replacement aren't the same then
81 # with context*_orig being the original, non-transliterated context.
82 # $table_paths is a mapping of table paths (here, only the keys, i.e.
83 # the actual paths, are used) to allow the user to choose a table to
84 # save a new replacement to.
85 # $cur_lineno is a display string to show the current line number
86 # $config_error is an optional flag to specify whether an error
87 # message should be displayed, informing the user that the config
88 # could not be loaded (used when "Reload config" is clicked)
89 # Returns: an array reference containing an action to be taken,
90 # in the form ["<action name>", <optional args>].
91 # See `handle_unknown_word_action` for currently accepted values
92 sub prompt_unknown_word {
93 # yes, this function really should take fewer arguments...
94 # it would be better to just pass the substrings and an index
95 my ($contextl, $contextl_orig, $word_repl, $word, $contextr, $contextr_orig,
96 $config, $cur_lineno, $args, $config_error) = @_;
97 my $action;
98 my $stop = 0;
99
100 my $window = Gtk3::Window->new('toplevel');
101 $window->signal_connect(delete_event => sub {return FALSE});
102 $window->signal_connect(destroy => sub { Gtk3->main_quit; });
103 $window->set_border_width(10);
104
105 my $vbox = Gtk3::VBox->new(FALSE, 10);
106
107 my $linelabel = Gtk3::Label->new("Current line: $cur_lineno");
108 $vbox->pack_start($linelabel, FALSE, FALSE, 0);
109 $linelabel->show;
110
111 my $wordlabel = Gtk3::Label->new("Word not found: $word");
112 $wordlabel->set_alignment(0.0, 0.0);
113 $vbox->pack_start($wordlabel, FALSE, FALSE, 0);
114 $wordlabel->show;
115
116 # Make a text box with the given left and right context and label
117 # Also creates a button allowing the user to set the currently
118 # selected text as the word to be replaced - useful when only part
119 # of the entire word that was not found has to be replaced
120 my $make_context_box = sub {
121 # look, don't blame me for these miserably named variables...
122 my ($ctxtl, $wrd, $ctxtr, $lbl, $btn_lbl) = @_;
123 my $hbox = Gtk3::HBox->new(FALSE, 5);
124 my $label = Gtk3::Label->new_with_mnemonic($lbl);
125 my $text = Gtk3::TextView->new;
126 $label->set_mnemonic_widget($text);
127 $text->set_wrap_mode("word");
128 my $buffer = $text->get_buffer();
129 $buffer->set_text($ctxtr);
130 my $highlight = $buffer->create_tag("yellow_bg", "background", "yellow");
131 my $start_iter = $buffer->get_start_iter();
132 $buffer->insert_with_tags($start_iter, $wrd, $highlight);
133 $start_iter = $buffer->get_start_iter();
134 $buffer->insert($start_iter, $ctxtl);
135 # set cursor position to beginning of word
136 my $cur_iter = $buffer->get_iter_at_offset(length($ctxtl));
137 $buffer->place_cursor($cur_iter);
138 my $button = Gtk3::Button->new($btn_lbl);
139 $button->signal_connect(
140 clicked => sub {
141 if (my ($start, $end) = $buffer->get_selection_bounds()) {
142 $word = $buffer->get_text($start, $end, FALSE);
143 $wordlabel->set_text("Selected: $word");
144 }
145 }, $window);
146 $hbox->pack_start($label, FALSE, FALSE, 0);
147 $hbox->pack_start($text, TRUE, TRUE, 0);
148 $vbox->pack_start($hbox, FALSE, FALSE, 0);
149 $hbox = Gtk3::HBox->new(FALSE, 5);
150 $hbox->pack_start($button, FALSE, FALSE, 0);
151 my $complete_text = $ctxtl . $wrd . $ctxtr;
152 $button = Gtk3::Button->new("Reset text");
153 $button->signal_connect(
154 clicked => sub {
155 $buffer->set_text($complete_text);
156 }, $window);
157 $hbox->pack_start($button, FALSE, FALSE, 0);
158 $vbox->pack_start($hbox, FALSE, FALSE, 0);
159 };
160 $make_context_box->(
161 $contextl_orig, $word, $contextr_orig,
162 "Ori_ginal context: ",
163 "Use _original selection as word"
164 );
165 $make_context_box->(
166 $contextl, $word_repl, $contextr,
167 "Transliterated _context: ",
168 "Use _transliterated selection as word"
169 );
170
171 my $hbox = Gtk3::HBox->new(FALSE, 5);
172 my $label = Gtk3::Label->new("Ignore: ");
173 $hbox->pack_start($label, FALSE, FALSE, 0);
174 my $button = Gtk3::Button->new("Th_is run");
175 $button->signal_connect(
176 clicked => sub {
177 $action = ["ignore", "run", $word];
178 $window->destroy;
179 }, $window);
180 $hbox->pack_start($button, FALSE, FALSE, 0);
181 $button = Gtk3::Button->new("_Permanently");
182 $button->signal_connect(
183 clicked => sub {
184 $action = ["ignore", "permanent", $word];
185 $window->destroy;
186 }, $window);
187 $hbox->pack_start($button, FALSE, FALSE, 0);
188 $button = Gtk3::Button->new("_Whole line");
189 $button->signal_connect(
190 clicked => sub {
191 $action = ["ignore", "wholeline"];
192 $window->destroy;
193 }, $window);
194 $hbox->pack_start($button, FALSE, FALSE, 0);
195 $vbox->pack_start($hbox, FALSE, FALSE, 0);
196
197 # AHHHH! IT BURNS!!! THE CODE IS SO HORRIBLE!
198 # Take note, kids - this is what happens when you keep adding
199 # features without rethinking your basic design.
200
201 $hbox = Gtk3::HBox->new(FALSE, 5);
202 $label = Gtk3::Label->new_with_mnemonic("Add to _list: ");
203 $hbox->pack_start($label, FALSE, FALSE, 0);
204 my $path_list = Gtk3::ComboBoxText->new;
205 $label->set_mnemonic_widget($path_list);
206 foreach my $path (sort keys %{$config->{"display_tables"}}) {
207 $path_list->append_text($path);
208 }
209 $hbox->pack_start($path_list, FALSE, FALSE, 0);
210 $vbox->pack_start($hbox, FALSE, FALSE, 0);
211
212 $hbox = Gtk3::HBox->new(FALSE, 5);
213 $label = Gtk3::Label->new_with_mnemonic("_Replacement: ");
214 $hbox->pack_start($label, FALSE, FALSE, 0);
215 my $replace_entry = Gtk3::Entry->new;
216 $label->set_mnemonic_widget($replace_entry);
217 $hbox->pack_start($replace_entry, TRUE, TRUE, 0);
218 $vbox->pack_start($hbox, FALSE, FALSE, 0);
219
220 if (exists $config->{"retrywithout"}) {
221 $hbox = Gtk3::HBox->new(FALSE, 5);
222 # Pressing Alt+e just activates the first of the retrywithout buttons
223 $label = Gtk3::Label->new_with_mnemonic("Retry without: ");
224 $hbox->pack_start($label, FALSE, FALSE, 0);
225 foreach my $without (@{$config->{"retrywithout"}}) {
226 $button = Gtk3::Button->new("$without->[0]");
227 $button->signal_connect(
228 clicked => sub {
229 my @chars = @{$without}[1..$#$without];
230 my $stripped = replace_strip_chars($config, $args, \@chars, $word);
231 # recombine substrings
232 my $repl_text = "";
233 $repl_text .= $_->[1] foreach @$stripped;
234 $replace_entry->set_text($repl_text);
235 }, $window);
236 $hbox->pack_start($button, FALSE, FALSE, 0);
237 }
238 $vbox->pack_start($hbox, FALSE, FALSE, 0);
239 }
240
241 $hbox = Gtk3::HBox->new(FALSE, 0);
242 $button = Gtk3::Button->new("_Add replacement");
243 $button->signal_connect(
244 clicked => sub {
245 if ($path_list->get_active != -1) {
246 $action = ["add", $word, $replace_entry->get_text, $path_list->get_active_text];
247 $window->destroy;
248 }
249 }, $window);
250 $hbox->pack_start($button, FALSE, FALSE, 0);
251 $vbox->pack_start($hbox, FALSE, FALSE, 0);
252
253 $hbox = Gtk3::HBox->new(FALSE, 5);
254 $button = Gtk3::Button->new("_Stop processing");
255 $button->signal_connect(
256 clicked => sub {
257 $stop = 1;
258 $window->destroy;
259 }, $window);
260 $hbox->pack_start($button, FALSE, FALSE, 0);
261
262 $button = Gtk3::Button->new("Reload config");
263 $button->signal_connect(
264 clicked => sub {
265 $action = ["reload"];
266 $window->destroy;
267 }, $window);
268 $hbox->pack_start($button, FALSE, FALSE, 0);
269
270 if ($config_error) {
271 $label = Gtk3::Label->new("Error loading config; see terminal output for details");
272 $hbox->pack_start($label, FALSE, FALSE, 0);
273 }
274 $vbox->pack_start($hbox, FALSE, FALSE, 0);
275
276 $window->add($vbox);
277 $window->show_all;
278 Gtk3->main;
279
280 die "Processing stopped at line $cur_lineno\n" if $stop;
281
282 if (!$action) {
283 # This action isn't explicitly handled, but at least nothing
284 # breaks when the window is closed without selecting an action
285 $action = ["dummy"];
286 }
287 return $action;
288 }
289
290 # Prompt the user when a word has multiple replacement options (separated by $config->{choicesep})
291 # $cur_lineno - display string to show the current line number
292 # Returns:
293 # 3, if this window needs to be called again but nothing needs
294 # to be re-transliterated
295 # 1, if the line needs to be re-transliterated
296 # 0, if nothing needs to be done
297 sub prompt_choose_word {
298 my ($substrings, $config, $args, $cur_lineno) = @_;
299
300 # make a list of all substrings that contain multiple word options
301 my @replacements;
302 foreach (0..$#$substrings) {
303 if ($substrings->[$_]->[0] && $substrings->[$_]->[1] =~ /\Q$config->{choicesep}\E/) {
304 # This ugly bit of code is here as a special case for transliterating
305 # Hindi to Urdu text - if there are *exactly* two choices and one
306 # contains diacritics but the other one doesn't, the one with diacritics
307 # is automatically used instead of prompting the user.
308 if (exists $config->{"targetdiacritics"}) {
309 my @choices = split /\Q$config->{choicesep}\E/, $substrings->[$_]->[1];
310 my @diacritics = @{$config->{"targetdiacritics"}};
311 if (@choices == 2) {
312 @choices = map {NFD($_)} @choices;
313 my $first_matches = grep {$choices[0] =~ /$_/} @diacritics;
314 my $second_matches = grep {$choices[1] =~ /$_/} @diacritics;
315 if ($first_matches && !$second_matches) {
316 $substrings->[$_]->[1] = $choices[0];
317 next;
318 } elsif (!$first_matches && $second_matches) {
319 $substrings->[$_]->[1] = $choices[1];
320 next;
321 }
322 }
323 }
324 # Format of the elements in @replacements:
325 # [<id of substrings in $substrings>, <replacement word>, <original string>]
326 push @replacements, [$_, $substrings->[$_]->[1], $substrings->[$_]->[1]];
327 }
328 }
329 # no substrings have multiple options
330 return if (!@replacements);
331
332 my $stop = 0;
333 my $open_unknown = 0;
334 my $cur_replacement = 0;
335
336 my $window = Gtk3::Window->new('toplevel');
337 $window->signal_connect(delete_event => sub {return FALSE});
338 $window->signal_connect(destroy => sub { Gtk3->main_quit; });
339 $window->set_border_width(10);
340
341 my $vbox = Gtk3::VBox->new(FALSE, 0);
342
343 my $linelabel = Gtk3::Label->new("Current line: $cur_lineno");
344 $vbox->pack_start($linelabel, FALSE, FALSE, 0);
345
346 my $wordlabel = Gtk3::Label->new("");
347 $wordlabel->set_alignment(0.0, 0.0);
348 $vbox->pack_start($wordlabel, FALSE, FALSE, 0);
349
350 my $undo = Gtk3::Button->new("_Undo");
351 $vbox->pack_start($undo, FALSE, FALSE, 0);
352 $undo->set_sensitive(FALSE);
353
354 my $button_vbox = Gtk3::VBox->new(FALSE, 0);
355 $vbox->pack_start($button_vbox, FALSE, FALSE, 0);
356
357 my $accept = Gtk3::Button->new("_Accept changes?");
358 $vbox->pack_start($accept, FALSE, FALSE, 0);
359
360 my $hbox = Gtk3::HBox->new(FALSE, 0);
361 my $label = Gtk3::Label->new("Context: ");
362 my $text = Gtk3::TextView->new;
363 $text->set_wrap_mode("word");
364 my $buffer = $text->get_buffer();
365 my $highlight = $buffer->create_tag("yellow_bg", "background", "yellow");
366 $text->set_editable(FALSE);
367 $hbox->pack_start($label, FALSE, FALSE, 0);
368 $hbox->pack_start($text, TRUE, TRUE, 10);
369 $vbox->pack_start($hbox, FALSE, FALSE, 10);
370
371 $hbox = Gtk3::HBox->new(FALSE, 5);
372 my $skip_button = Gtk3::Button->new("Sk_ip word");
373 $hbox->pack_start($skip_button, FALSE, FALSE, 0);
374 my $unknown_button = Gtk3::Button->new("_Open in unknown word window");
375 $hbox->pack_start($unknown_button, FALSE, FALSE, 0);
376 my $stop_button = Gtk3::Button->new("_Stop processing");
377 $hbox->pack_start($stop_button, FALSE, FALSE, 0);
378 $vbox->pack_start($hbox, FALSE, FALSE, 0);
379
380 # generate the context to the left and to the right of the current word being replaced
381 my $get_context = sub {
382 my ($contextl, $contextr) = ("", "");
383 my $tmp_replacement = 0;
384 foreach (0..$#$substrings) {
385 my $word = $substrings->[$_]->[1];
386 if ($tmp_replacement <= $#replacements && $replacements[$tmp_replacement]->[0] == $_) {
387 $word = $replacements[$tmp_replacement]->[1];
388 $tmp_replacement++;
389 }
390 # When nothing is left to replace, the entire string is in $contextl
391 if ($cur_replacement > $#replacements || $_ < $replacements[$cur_replacement]->[0]) {
392 $contextl .= $word;
393 } elsif ($_ > $replacements[$cur_replacement]->[0]) {
394 $contextr .= $word;
395 }
396 }
397 return ($contextl, $contextr);
398 };
399
400 # fill the text buffer with the context and current word, highlighting the word
401 # if $cur_replacement is after the end of @replacements, don't highlight anything
402 # (this happens when all words have been replaced and the user only needs to accept the changes)
403 my $fill_text_buffer = sub {
404 my $start = $buffer->get_start_iter();
405 my $end = $buffer->get_end_iter();
406 $buffer->delete($start, $end);
407 my ($contextl, $contextr) = $get_context->();
408 $buffer->set_text($contextr);
409 if ($cur_replacement <= $#replacements) {
410 $start = $buffer->get_start_iter();
411 $buffer->insert_with_tags($start, $replacements[$cur_replacement]->[1], $highlight);
412 }
413 $start = $buffer->get_start_iter();
414 $buffer->insert($start, $contextl);
415 };
416
417 my $show_accept = sub {
418 $button_vbox->foreach(sub {my $child = shift; $child->destroy();});
419 $accept->show;
420 $accept->grab_focus;
421 $wordlabel->set_text("");
422 $skip_button->set_sensitive(FALSE);
423 $unknown_button->set_sensitive(FALSE);
424 };
425
426 my $fill_button_vbox; # forward-declaration so it can be used here already
427 my $next_word = sub {
428 $undo->set_sensitive(TRUE);
429 $cur_replacement++;
430 $fill_text_buffer->();
431 if ($cur_replacement > $#replacements) {
432 $show_accept->();
433 return;
434 }
435 $fill_button_vbox->();
436 };
437
438 # fill $button_vbox with the word options for the current word
439 $fill_button_vbox = sub {
440 $button_vbox->foreach(sub {my $child = shift; $child->destroy();});
441 my $word = $replacements[$cur_replacement]->[1];
442 $wordlabel->set_text("Word \"$word\" has multiple replacement options:");
443 my @choices = split /\Q$config->{choicesep}\E/, $replacements[$cur_replacement]->[1];
444 if (exists $config->{"targetdiacritics"}) {
445 # This nasty bit of code finds the number of diacritics in every
446 # choice and sorts the choice in descending order based on that
447 my %choice_nums;
448 foreach my $choice (@choices) {
449 $choice_nums{$choice} = 0;
450 foreach my $diacritic (@{$config->{"targetdiacritics"}}) {
451 my @matches = NFD($choice) =~ /$diacritic/;
452 $choice_nums{$choice} += scalar @matches if @matches;
453 }
454 }
455 @choices = sort {$choice_nums{$b} <=> $choice_nums{$a}} @choices;
456 }
457 my $cur_num = 1;
458 foreach my $word_choice (@choices) {
459 # the mnemonics don't make sense when the number has more than one digit
460 my $choice_label = $cur_num <= 9 ? "_$cur_num: $word_choice" : $word_choice;
461 my $button = Gtk3::Button->new($choice_label);
462 $button->signal_connect(
463 clicked => sub {
464 $replacements[$cur_replacement]->[1] = $word_choice;
465 $next_word->();
466 }, $window);
467 $button_vbox->pack_start($button, FALSE, FALSE, 0);
468 $button->show;
469 $cur_num++;
470 }
471 };
472
473 $undo->signal_connect(
474 clicked => sub {
475 if ($cur_replacement > 0) {
476 $cur_replacement--;
477 if ($cur_replacement == 0) {
478 $undo->set_sensitive(FALSE);
479 }
480 $replacements[$cur_replacement]->[1] = $replacements[$cur_replacement]->[2];
481 $fill_button_vbox->();
482 $fill_text_buffer->();
483 $accept->hide;
484 $skip_button->set_sensitive(TRUE);
485 $unknown_button->set_sensitive(TRUE);
486 my $word = $replacements[$cur_replacement]->[1];
487 $wordlabel->set_text("Word \"$word\" has multiple replacement options:");
488 }
489 }, $window);
490
491 $accept->signal_connect(
492 clicked => sub {
493 # write the changes to the original $substrings
494 foreach (@replacements) {
495 $substrings->[$_->[0]]->[1] = $_->[1];
496 }
497 $window->destroy;
498 }, $window);
499
500 $skip_button->signal_connect(clicked => $next_word, $window);
501
502 $unknown_button->signal_connect(
503 clicked => sub {
504 $open_unknown = 1;
505 $window->destroy;
506 }, $window);
507
508 $stop_button->signal_connect(
509 clicked => sub {
510 $stop = 1;
511 $window->destroy;
512 }, $window);
513
514 $fill_button_vbox->();
515 $fill_text_buffer->();
516
517 $window->add($vbox);
518 $window->show_all;
519 $accept->hide;
520 Gtk3->main;
521 die "Processing stopped at line $cur_lineno\n" if $stop;
522 if ($open_unknown) {
523 my $ret = call_unknown_word_window(
524 $substrings, $replacements[$cur_replacement]->[0],
525 $config, $args, $cur_lineno);
526 # the word choice window still needs to be called again
527 # when 0 is returned
528 return 3 if $ret == 0;
529 return $ret;
530 }
531 return 0;
532 }
533
534 my $ID = 0;
535 my $STRING = 1;
536
537 # Parse the configuration file into data type (currently only ID and STRING)
538 sub parse_config {
539 my $f = shift;
540 my $fh;
541 if (!open($fh, "<", $f)) {
542 warn "Can't open config file \"$f\"!\n";
543 return;
544 }
545 my @commands;
546 my $state = 0;
547 my $IN_ID = 1;
548 my $IN_STR = 2;
549 my $cur_val = "";
550 while (my $line = <$fh>) {
551 chomp($line);
552 $state = 0;
553 push(@commands, []);
554 foreach my $char (split(//, $line)) {
555 if ($char eq "#" && !($state & $IN_STR)) {
556 last;
557 } elsif ($char eq '"') {
558 if ($state & $IN_STR) {
559 push(@{$commands[-1]}, {type => $STRING, value => $cur_val});
560 $cur_val = "";
561 $state &= ~$IN_STR;
562 } else {
563 $cur_val = "";
564 $state |= $IN_STR;
565 }
566 } elsif ($char eq " ") {
567 if ($state & $IN_ID) {
568 push(@{$commands[-1]}, {type => $ID, value => $cur_val});
569 $state &= ~$IN_ID;
570 $cur_val = "";
571 } elsif ($state) {
572 $cur_val .= $char;
573 }
574 } else {
575 if (!$state) {
576 $state |= $IN_ID;
577 }
578 $cur_val .= $char;
579 }
580 }
581 if ($state & $IN_STR) {
582 warn "ERROR: Unterminated string in config:\n$line";
583 return;
584 } elsif ($cur_val) {
585 push(@{$commands[-1]}, {type => $ID, value => $cur_val});
586 $cur_val = "";
587 }
588 if ($#{$commands[-1]} == -1) {
589 pop(@commands);
590 }
591 }
592 close($fh);
593
594 return \@commands;
595 }
596
597 # if the path is relative, find its absolute location based
598 # on the location of the config file
599 sub open_file_rel_abs {
600 my ($filename, $config_file, $mode) = @_;
601 $mode //= "<";
602 if (!file_name_is_absolute $filename) {
603 my $config_dir = dirname $config_file;
604 $filename = rel2abs($filename, $config_dir);
605 }
606 my $fh;
607 if (!open $fh, $mode, $filename) {
608 warn "Can't open file \"$filename\"!\n";
609 return;
610 }
611 return $fh;
612 }
613
614 # Load a file of replacement words into a hash table
615 sub load_table {
616 my ($filename, $args, $config, $revert) = @_;
617 my $fh = open_file_rel_abs $filename, $args->{"config"};
618 return if !$fh;
619 my %table;
620 while (my $line = <$fh>) {
621 chomp $line;
622 next if (!$line);
623 my @words = split(/\Q$config->{tablesep}\E/, $line);
624 if (@words != 2) {
625 warn "ERROR: Malformed line in file \"$filename\":\n$line\n";
626 close $fh;
627 return;
628 }
629 my $word;
630 my $replacement;
631 if ($revert) {
632 $word = NFD $words[1];
633 $replacement = $words[0];
634 } else {
635 $word = NFD $words[0];
636 $replacement = $words[1];
637 }
638 my @word_choices = split /\Q$config->{choicesep}\E/, $word;
639 foreach my $word_choice (@word_choices) {
640 if (exists $table{$word_choice}) {
641 if ($args->{"checkduplicates"}) {
642 warn "WARNING: Duplicate word in file \"$filename\": " .
643 "\"$word_choice\", with replacement \"$replacement\", " .
644 "already exists with replacement \"$table{$word_choice}\".\n";
645 }
646 $table{$word_choice} = get_unique_words(
647 $table{$word_choice} .
648 $config->{choicesep} .
649 $replacement, $config);
650 } else {
651 $table{$word_choice} = $replacement;
652 }
653 }
654 }
655 close $fh;
656 return \%table;
657 }
658
659 # Load table for words to ignore - only the keys matter, since there is no replacement
660 sub load_ignore_table {
661 my ($filename, $args) = @_;
662 my $fh = open_file_rel_abs $filename, $args->{"config"};
663 return if !$fh;
664 my %table;
665 while (my $line = <$fh>) {
666 chomp $line;
667 $table{NFD($line)} = "" if $line;
668 }
669 close $fh;
670 return \%table;
671 }
672
673 # Generate all forms of a word by combining it with endings
674 # Returns:
675 # 0 - an error occurred
676 # 1 - everything's fine
677 sub expand_table {
678 my ($table, $forms, $noroot, $config) = @_;
679 my %new_table;
680 foreach my $word (keys %$table) {
681 foreach my $ending (keys %$forms) {
682 # Some words and/or endings have multiple options, separated by $config->{choicesep}
683 # These must be temporarily separated in order to properly generate the forms
684 my @word_options;
685 my @stem_options = split(/\Q$config->{choicesep}\E/, $table->{$word});
686 my @ending_options = split(/\Q$config->{choicesep}\E/, $forms->{$ending});
687 foreach my $stem_option (@stem_options) {
688 foreach my $ending_option (@ending_options) {
689 push(@word_options, $stem_option . $ending_option);
690 }
691 }
692 $new_table{$word . $ending} = join($config->{choicesep}, @word_options);
693 }
694 $new_table{$word} = $table->{$word} if !$noroot;
695 }
696 return \%new_table;
697 }
698
699 # Check if the number and types of arguments given to a config command are right
700 # Returns:
701 # undef - the arguments don't match
702 # 1 - the arguments match
703 sub check_args {
704 my ($args, $cmd) = @_;
705 my $cmd_name = $cmd->[0]->{"value"};
706 if ($#$cmd - 1 < $#$args) {
707 my $err = "ERROR: not enough arguments for command \"$cmd_name\":";
708 foreach my $arg (@{$cmd}[1..$#$cmd]) {
709 $err .= " " . $arg->{"value"}
710 }
711 warn "$err\n";
712 return;
713 }
714 my $arg_num = 0;
715 while ($arg_num <= $#$args) {
716 if ($cmd->[$arg_num + 1]->{"type"} != $args->[$arg_num]) {
717 my $err = "ERROR: argument type mismatch for command \"$cmd_name\".\n";
718 $err .= "Expected:";
719 foreach my $arg_type (@$args) {
720 $err .= " ID" if ($arg_type == $ID);
721 $err .= " STRING" if ($arg_type == $STRING);
722 }
723 $err .= "\nReceived:";
724 foreach my $arg (@{$cmd}[1..$#$cmd]) {
725 $err .= " ID" if ($arg->{"type"} == $ID);
726 $err .= " STRING" if ($arg->{"type"} == $STRING);
727 }
728 warn "$err\n";
729 return;
730 }
731 $arg_num++;
732 }
733 return 1;
734 }
735
736 # Interpret the config file - load and expand tables, etc.
737 # $config_list - the list returned by parse_config
738 sub interpret_config {
739 my ($config_list, $args) = @_;
740 my %tables;
741 my %config;
742 # table_paths stores a list of all table and replacement ids that are
743 # impacted by the path so the replacement can be added on the fly when
744 # a new replacement is added from the GUI
745 # the "replacement id" is just the number of the replacement group,
746 # starting at 0 with the first group in the config
747 $config{"table_paths"} = {};
748 # reverted_tables stores a hash of the paths of all tables that are
749 # reverted so that replacements added from the GUI are added in the
750 # right order
751 $config{"reverted_tables"} = {};
752 # these are the paths of the tables that are displayed in the GUI
753 $config{"display_tables"} = {};
754 # a mapping between the table ids and tables for all tables used as
755 # ending tables in expand statements - so expansions can be done
756 # on the fly when adding a replacement word from the GUI
757 $config{"ending_tables"} = {};
758 # ignore is the path to the ignore file, ignore_words the actual table
759 $config{"ignore"} = "";
760 $config{"ignore_words"} = {};
761 $config{"split"} = "\\s+";
762 $config{"beforeword"} = "\\s";
763 $config{"afterword"} = "\\s";
764 $config{"tablesep"} = "\t";
765 $config{"choicesep"} = "\$";
766 # a list of "replacement configs", which specify the type and any
767 # other arguments (this is given to replace_match, etc.
768 $config{"replacements"} = [];
769 # these are temporary mappings used while loading the config
770 my %path_to_table;
771 my %table_id_to_path;
772 my %mandatory_args = (
773 "ignore" => [$STRING],
774 "table" => [$ID],
775 "expand" => [$ID, $ID],
776 "match" => [$STRING, $STRING],
777 "matchignore" => [$STRING],
778 "replace" => [$ID],
779 "split" => [$STRING],
780 "beforeword" => [$STRING],
781 "afterword" => [$STRING],
782 "tablesep" => [$STRING],
783 "choicesep" => [$STRING],
784 "comment" => [$STRING],
785 "group" => [],
786 "endgroup" => [],
787 "retrywithout" => [$STRING],
788 "targetdiacritics" => [$STRING]
789 );
790 my $in_group = 0;
791 foreach my $cmd (@$config_list) {
792 # All load statements must be before expand statements
793 # All expand, beforeword, and afterword statements must be before replace statements
794 my $cmd_name = $cmd->[0]->{"value"};
795 if ($cmd->[0]->{"type"} == $ID) {
796 if (!exists($mandatory_args{$cmd->[0]->{"value"}})) {
797 warn "ERROR: Unknown command \"" . $cmd->[0]->{"value"} . "\" in config\n";
798 return;
799 }
800 return if !check_args($mandatory_args{$cmd_name}, $cmd);
801 if ($cmd_name eq "table") {
802 my $table_path = $cmd->[2]->{"value"};
803 my %table_args;
804 foreach (3..$#$cmd) {
805 $table_args{$cmd->[$_]->{"value"}} = 1;
806 }
807 my $table;
808 # add to temporary path-to-table mapping so tables aren't
809 # loaded unnecessarily
810 if (exists $path_to_table{$table_path}) {
811 $table = $path_to_table{$table_path};
812 } else {
813 $table = load_table $table_path, $args, \%config, $table_args{"revert"};
814 return if !defined $table;
815 $path_to_table{$table_path} = $table;
816 }
817 if ($table_args{"revert"}) {
818 $config{"reverted_tables"}->{$table_path} = 1;
819 }
820 my $table_id = $cmd->[1]->{"value"};
821 $tables{$table_id} = $table;
822 $table_id_to_path{$table_id} = $table_path;
823 # this is a hash to avoid duplicates if the same file
824 # is loaded multiple times
825 $config{"display_tables"}->{$table_path} = 1 if !exists $table_args{"nodisplay"};
826 } elsif ($cmd_name eq "expand") {
827 my $orig_table_id = $cmd->[1]->{"value"};
828 my $ending_table_id = $cmd->[2]->{"value"};
829 my $noroot = 0;
830 if ($#$cmd >= 3 && $cmd->[3]->{"value"} eq "noroot") {
831 $noroot = 1;
832 }
833 if (!exists $tables{$orig_table_id}) {
834 warn "expand: table \"$orig_table_id\" doesn't exist\n";
835 return;
836 } elsif (!exists $tables{$ending_table_id}) {
837 warn "expand: table \"$ending_table_id\" doesn't exist\n";
838 return;
839 }
840
841 $config{"ending_tables"}->{$ending_table_id} = $tables{$ending_table_id};
842 $config{"expands"}->{$orig_table_id} = [] if !exists $config{"expands"}->{$orig_table_id};
843 push @{$config{"expands"}->{$orig_table_id}}, [$ending_table_id, $noroot];
844
845 my $new_table = expand_table($tables{$orig_table_id}, $tables{$ending_table_id}, $noroot, \%config);
846 return if !$new_table;
847 $tables{$orig_table_id} = $new_table;
848 } elsif ($cmd_name eq "group") {
849 if ($in_group) {
850 warn "ERROR: New group started without ending last one in config\n";
851 return;
852 }
853 push @{$config{"replacements"}}, {
854 "type" => "group", "tables" => [],
855 "words" => {}, "options" => {}};
856 # add all options such as "endword" to the options hash
857 for (1..$#$cmd) {
858 $config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1;
859 }
860 $in_group = 1;
861 } elsif ($cmd_name eq "endgroup") {
862 if (!$in_group) {
863 warn "ERROR: endgroup command called while not in group\n";
864 return;
865 }
866 $in_group = 0;
867 } elsif ($cmd_name eq "match") {
868 if ($in_group) {
869 warn "ERROR: match command is invalid inside group\n";
870 return;
871 }
872 push @{$config{"replacements"}}, {
873 "type" => "match",
874 "options" => {},
875 "search" => NFD($cmd->[1]->{"value"}),
876 "replace" => $cmd->[2]->{"value"}};
877 for (3..$#$cmd) {
878 # add optional arguments as keys in options hash
879 $config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1;
880 }
881 } elsif ($cmd_name eq "matchignore") {
882 if ($in_group) {
883 warn "ERROR: matchignore command is invalid inside group\n";
884 return;
885 }
886 push @{$config{"replacements"}}, {
887 "type" => "match",
888 "options" => {},
889 "search" => NFD($cmd->[1]->{"value"})};
890 for (2..$#$cmd) {
891 $config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1;
892 }
893 } elsif ($cmd_name eq "replace") {
894 if (!$in_group) {
895 warn "ERROR: replace command called while not in group\n";
896 return;
897 }
898 my $table = $cmd->[1]->{"value"};
899 if (!exists($tables{$table})) {
900 warn "ERROR: nonexistent table \"$table\" in replace statement.\n";
901 return;
902 }
903
904 # make a list of all replacements that are affected by this
905 # file so that they can be updated when a word is added
906 # through the gui
907 my $table_path = $table_id_to_path{$table};
908 my $replacement_id = $#{$config{"replacements"}};
909 $config{"table_paths"}->{$table_path} = [] if !exists $config{"table_paths"}->{$table_path};
910 push @{$config{"table_paths"}->{$table_path}}, [$replacement_id, $table];
911
912 # store list of tables for --debug
913 push @{$config{"replacements"}->[-1]->{"tables"}}, $table;
914
915 # Note: we don't need to check if $table{"choicesep"} was defined
916 # here since we can't ever get this far without first having
917 # loaded a table anyways
918 my $trie_root = $config{"replacements"}->[-1]->{"words"};
919 my $override = $#$cmd >= 2 && $cmd->[2]->{"value"} eq "override";
920 add_to_trie($table, $trie_root, $tables{$table}, $args, \%config, $override);
921 } elsif ($cmd_name eq "retrywithout") {
922 if (!exists $config{"retrywithout"}) {
923 $config{"retrywithout"} = [];
924 }
925 # first value is the display name
926 my @values = map {$_->{"value"}} @{$cmd}[1..$#$cmd];
927 push @{$config{"retrywithout"}}, \@values;
928 } elsif ($cmd_name eq "targetdiacritics") {
929 if (!exists $config{$cmd_name}) {
930 $config{$cmd_name} = [];
931 }
932 foreach (1..$#$cmd) {
933 push @{$config{$cmd_name}}, $cmd->[$_]->{"value"};
934 }
935 } elsif ($cmd_name eq "split" || $cmd_name eq "beforeword" ||
936 $cmd_name eq "afterword" || $cmd_name eq "tablesep" ||
937 $cmd_name eq "choicesep" || $cmd_name eq "comment") {
938 $config{$cmd_name} = $cmd->[1]->{"value"};
939 } elsif ($cmd_name eq "ignore") {
940 $config{"ignore"} = $cmd->[1]->{"value"};
941 my $table = load_ignore_table $cmd->[1]->{"value"}, $args;
942 return if !defined $table;
943 $config{"ignore_words"} = $table;
944 } else {
945 warn "ERROR: unknown command \"" . $cmd_name . "\" in config.\n";
946 return;
947 }
948 } else {
949 my $err = "ERROR: line does not start with command:\n";
950 foreach my $cmd_part (@$cmd) {
951 $err .= $cmd_part->{"value"};
952 }
953 warn "$err\n";
954 return;
955 }
956 }
957 if ($in_group) {
958 warn "ERROR: unclosed group in config\n";
959 return;
960 }
961 if (!$config{"ignore"}) {
962 warn "ERROR: no file of words to ignore specified.\n";
963 return;
964 }
965 if ($args->{"dumptables"}) {
966 foreach my $table_id (keys %tables) {
967 my $table_path = $table_id_to_path{$table_id};
968 if ($config{"display_tables"}->{$table_path}) {
969 for my $word (keys %{$tables{$table_id}}) {
970 print NFC($word) . "\n";
971 }
972 }
973 }
974 }
975 return \%config;
976 }
977
978 # load the config file
979 # Returns:
980 # the config hash or undef if an error occurred
981 sub load_config {
982 my $args = shift;
983 my $config_list = parse_config($args->{"config"});
984 if (!$config_list) {
985 return;
986 }
987 return interpret_config $config_list, $args;
988 }
989
990 # Handle the action returned by `prompt_unknown_word`
991 # $config - the current program config
992 # $args - the command line arguments
993 # Currently accepted values for $action:
994 # ["ignore", "run", $word] - only ignore $word for the rest of this run
995 # ["ignore", "permanent", $word] - write $word to the permanent ignore file
996 # ["add", $word, $replace_word, $table_path] - add $word to the table
997 # corresponding to $table_path with $replace_word as its replacement. Note that
998 # only tables directly corresponding to paths work here - tables that only
999 # were created through "expand" in the config aren't ever shown separately
1000 # in `prompt_unknown_word`
1001 # ["reload"] - reload the configuration file
1002 # Returns:
1003 # 0 - nothing needs to be done
1004 # 1 - the current line needs to be re-transliterated with the new config
1005 # 2 - an error occurred while reloading the config
1006 # 3 - stop asking for unknown words on this line
1007 sub handle_unknown_word_action {
1008 my ($action, $config, $args) = @_;
1009 if ($action->[0] eq "ignore") {
1010 # yeah, this is a bit messy and inconsistent
1011 return 3 if $action->[1] eq "wholeline";
1012 $config->{"ignore_words"}->{$action->[2]} = "";
1013 if ($action->[1] eq "permanent") {
1014 my $fh = open_file_rel_abs $config->{"ignore"}, $args->{"config"}, ">>";
1015 return 1 if !$fh;
1016 print($fh $action->[2] . "\n");
1017 close($fh);
1018 } elsif ($action->[1] eq "run") {
1019 # Print to error file if ignore isn't permanent
1020 return 0 if ($args->{"errors"} eq "");
1021 my $fh;
1022 if (!open($fh, ">>", $args->{"errors"})) {
1023 warn "ERROR: Can't open error file \"$args->{errors}\".\n";
1024 return 0;
1025 }
1026 print($fh $action->[2] . "\n");
1027 close($fh);
1028 }
1029 return 0;
1030 } elsif ($action->[0] eq "add") {
1031 my $table_path = $action->[3];
1032 my $word = $action->[1];
1033 my $replace_word = $action->[2];
1034 # make sure to write the words in the correct order if the
1035 # tables were reverted while loading
1036 my $reverted = exists $config->{"reverted_tables"}->{$table_path};
1037 my $word_abs = $reverted ? $action->[2] : $action->[1];
1038 my $replace_word_abs = $reverted ? $action->[1] : $action->[2];
1039 my $fh = open_file_rel_abs $table_path, $args->{"config"}, ">>";
1040 return 1 if !$fh;
1041 print($fh $word_abs . $config->{tablesep} . $replace_word_abs . "\n");
1042 close($fh);
1043 # loop over all table ids that are impacted by this file
1044 foreach my $replacement (@{$config->{"table_paths"}->{$table_path}}) {
1045 my $replacement_id = $replacement->[0];
1046 my $table_id = $replacement->[1];
1047 my $trie = $config->{"replacements"}->[$replacement_id]->{"words"};
1048 my $final_table = {$word => $replace_word};
1049 # handle expansions
1050 foreach my $expand (@{$config->{"expands"}->{$table_id}}) {
1051 my $ending_table_id = $expand->[0];
1052 my $noroot = $expand->[1];
1053 my $endings_table = $config->{"ending_tables"}->{$ending_table_id};
1054 $final_table = expand_table $final_table, $endings_table, $noroot, $config;
1055 }
1056 add_to_trie($table_id, $trie, $final_table, $args, $config);
1057 }
1058 return 1;
1059 } elsif ($action->[0] eq "reload") {
1060 my $new_config = load_config $args;
1061 if ($new_config) {
1062 %$config = %$new_config;
1063 return 1;
1064 } else {
1065 return 2;
1066 }
1067 }
1068 }
1069
1070 # Split $substrings based on the "split" regex in $config.
1071 # $substrings can already be split at this point; only the
1072 # ones that haven't been transliterated yet are modified
1073 sub split_words {
1074 my ($config, $substrings) = @_;
1075 my $split_re = qr/($config->{"split"})/;
1076 my @substrings_new;
1077 foreach my $cur_substr (@$substrings) {
1078 if ($cur_substr->[0] == 1) {
1079 push(@substrings_new, $cur_substr);
1080 next;
1081 }
1082
1083 my @words = split(/$split_re/, $cur_substr->[1]);
1084 for my $i (0..$#words) {
1085 # Word is not delimiter
1086 # Split produces an empty field at the beginning if the string
1087 # starts with the delimiter
1088 if ($i % 2 == 0) {
1089 push(@substrings_new, [0, $words[$i], $words[$i]]) if ($words[$i] ne '');
1090 } else {
1091 # Delimiters can count as already replaced
1092 push(@substrings_new, [1, $words[$i], $words[$i]]);
1093 }
1094 }
1095 }
1096 @$substrings = @substrings_new;
1097 }
1098
1099 # small helper function to add a untransliterated string to the last substring
1100 # if that is not transliterated yet, or push it onto @$substrings otherwise
1101 # -> used to keep all untransliterated text in one piece
1102 # since this is also used for the "nofinal" attribute on "match", it takes
1103 # an original and replaced string (since, when using "match" and "nofinal",
1104 # the original string was replaced, but is still marked as unknown)
1105 sub push_unknown {
1106 my ($substrings, $orig, $replaced) = @_;
1107 $replaced //= $orig;
1108 if (@$substrings && !$substrings->[-1]->[0]) {
1109 $substrings->[-1]->[1] .= $replaced;
1110 $substrings->[-1]->[2] .= $orig;
1111 } else {
1112 push(@$substrings, [0, $replaced, $orig]);
1113 }
1114 }
1115
1116 # Replace a word in $substrings based on $replace_config using regex
1117 # $replace_config->{"search"} is the word to replace
1118 # $replace_config->{"replace"} is the replacement word
1119 # if $replace_config->{"replace"} is undefined, just splits
1120 # $substrings at the the match and marks that the match has
1121 # been transliterated - currently used for "matchignore"
1122 # $replace_config->{"beginword"}, $replace_config->{"afterword"} -
1123 # specifies if the match is only valid when $config->{"beforeword"}
1124 # or $config->{"afterword"} occur before or after it, respectively
1125 sub replace_match {
1126 my ($config, $replace_config, $substrings, $debug_msg) = @_;
1127 my $beginword = exists $replace_config->{"options"}->{"beginword"};
1128 my $endword = exists $replace_config->{"options"}->{"endword"};
1129 my $fullword = $beginword && $endword;
1130 my $beforeword = $config->{"beforeword"};
1131 my $afterword = $config->{"afterword"};
1132 my $word = $replace_config->{"search"};
1133 my $replace_word = $replace_config->{"replace"};
1134 if ($fullword) {
1135 $word = qr/(\A|$beforeword)$word(\z|$afterword)/;
1136 } elsif ($beginword) {
1137 $word = qr/(\A|$beforeword)$word/;
1138 } elsif ($endword) {
1139 $word = qr/$word(\z|$afterword)/;
1140 } else {
1141 $word = qr/$word/;
1142 }
1143
1144 my @substrings_new;
1145 # only modify $substrings at all if the word was found
1146 my $found_word = 0;
1147 my $last_idx;
1148 # @substrings_new is only used if needed to improve efficiency
1149 foreach my $i (0..$#$substrings) {
1150 if ($substrings->[$i]->[0]) {
1151 # FIXME: is there a way to make it more efficient by keeping the old array?
1152 # This is a major bottleneck
1153 # Note: the above statement *may* be a bit exaggerated
1154 if ($found_word) {
1155 push(@substrings_new, $substrings->[$i]);
1156 }
1157 next;
1158 }
1159 $last_idx = 0;
1160 my $i0 = 0;
1161 my $i1 = 0;
1162 while ($substrings->[$i]->[1] =~ m/$word/g) {
1163 if (!$found_word) {
1164 print $debug_msg if $debug_msg;
1165 $found_word = 1;
1166 if ($i != 0) {
1167 push(@substrings_new, @{$substrings}[0..$i-1]);
1168 }
1169 }
1170 # This mess is needed to reliably match $beforeword and $afterword and put the captured
1171 # "splitting" characters back into the text. This would be much easier just using
1172 # a lookbehind and lookahead, but I couldn't find a way to also match beginning and
1173 # end of string that way.
1174 $i0 = $-[0];
1175 $i1 = $+[0];
1176 if ($fullword) {
1177 $i0 += length($1);
1178 $i1 -= length($2);
1179 # pos need to be decreased so that matches still work right next to each other
1180 pos($substrings->[$i]->[1]) -= length($2);
1181 } elsif ($beginword) {
1182 $i0 += length($1);
1183 } elsif ($endword) {
1184 $i1 -= length($1);
1185 pos($substrings->[$i]->[1]) -= length($1);
1186 }
1187 if ($last_idx != $i0) {
1188 my $unknown = substr($substrings->[$i]->[1], $last_idx, $i0-$last_idx);
1189 push_unknown \@substrings_new, $unknown;
1190 }
1191 my $orig_str = substr($substrings->[$i]->[1], $i0, $i1-$i0);
1192 my $replace_str = $replace_word // $orig_str;
1193 if ($replace_config->{"options"}->{"nofinal"}) {
1194 warn "Replaced (nofinal) \"$orig_str\" with \"$replace_str\"\n" if $debug_msg;
1195 push_unknown \@substrings_new, $orig_str, $replace_str;
1196 } else {
1197 warn "Replaced \"$orig_str\" with \"$replace_str\"\n" if $debug_msg;
1198 push(@substrings_new, [1, $replace_str, $orig_str]);
1199 }
1200 $last_idx = $i1;
1201 }
1202 if ($last_idx < length($substrings->[$i]->[1]) && $found_word) {
1203 my $unknown = substr($substrings->[$i]->[1], $last_idx);
1204 push_unknown \@substrings_new, $unknown;
1205 }
1206 }
1207 if ($found_word) {
1208 @$substrings = @substrings_new;
1209 }
1210 }
1211
1212 # Replace a group, i.e. replace all the words in a trie
1213 # $replace_config->{"words"} - the root node of the trie
1214 # $replace_config->{"beginword"}, $replace_config->{"endword"} -
1215 # same as in `replace_match`
1216 sub replace_group {
1217 my ($config, $replace_config, $substrings, $debug_msg) = @_;
1218 my @substrings_new;
1219 my $word_found = 0;
1220 # Recurse backwords towards the root node of the trie to find the first
1221 # node with a key "final" which satisfies the ending condition (if "endword" is set)
1222 # Returns the id *after* the last match and the node that was found
1223 # with the key "final" (or undef, if nothing matched)
1224 my $find_final = sub {
1225 my ($i, $tmp_cur_node, $s) = @_;
1226 do {
1227 my $after_ch = substr($s->[1], $i, 1);
1228 if (exists $tmp_cur_node->{"final"} && (!exists($replace_config->{"options"}->{"endword"}) ||
1229 $after_ch eq "" || $after_ch =~ $config->{"afterword"})) {
1230 return ($i, $tmp_cur_node);
1231 }
1232 $i--;
1233 } while ($tmp_cur_node = $tmp_cur_node->{"parent"});
1234 # none of the points were appropriate for breaking the word, so
1235 # $tmp_cur_node now points to the nonexistent parent node of the
1236 # root node
1237 return ($i, undef);
1238 };
1239 foreach my $s (@$substrings) {
1240 if ($s->[0]) {
1241 push(@substrings_new, $s);
1242 next;
1243 }
1244 my $cur_node = $replace_config->{"words"};
1245 my $start_i = 0;
1246 my $i = 0;
1247 # This deliberately goes off the end of the string! $cur_node is always "one behind" $i
1248 # since the node is only advanced in the iteration *after* $i has increased, meaning that
1249 # $i has to already be after the end of the string for the first if statement to definitely
1250 # fail, causing the elsif statement to handle that case
1251 while ($i <= length($s->[1])) {
1252 # This works even when $i is one index after the end of the string - it just returns "" then
1253 my $ch = substr($s->[1], $i, 1);
1254 if (exists $cur_node->{$ch}) {
1255 if ($cur_node == $replace_config->{"words"}) {
1256 my $before_ch = $i > 0 ? substr($s->[1], $i - 1, 1) : "";
1257 if (exists($replace_config->{"options"}->{"beginword"}) &&
1258 $before_ch ne "" && $before_ch !~ $config->{"beforeword"}) {
1259 push_unknown \@substrings_new, $ch;
1260 $i++;
1261 next;
1262 }
1263 $start_i = $i;
1264 }
1265 $cur_node = $cur_node->{$ch};
1266 } elsif (exists $cur_node->{"final"} || $cur_node != $replace_config->{"words"} || $i == length($s->[1])-1) {
1267 my $tmp_cur_node = $cur_node;
1268 ($i, $tmp_cur_node) = $find_final->($i, $tmp_cur_node, $s);
1269 if (!defined($tmp_cur_node)) {
1270 push_unknown \@substrings_new, substr($s->[1], $i + 1, 1);
1271 $i += 2;
1272 } else {
1273 my $orig = substr($s->[1], $start_i, $i-$start_i);
1274 my $final = $tmp_cur_node->{"final"};
1275 if ($debug_msg) {
1276 if (!$word_found) {
1277 warn $debug_msg;
1278 $word_found = 1;
1279 }
1280 warn "Replaced \"$orig\" with \"$final\"\n";
1281 }
1282 push(@substrings_new, [1, $final, $orig]);
1283 }
1284 $cur_node = $replace_config->{"words"};
1285 next;
1286 } else {
1287 push_unknown \@substrings_new, $ch;
1288 }
1289 $i++;
1290 }
1291 }
1292 @$substrings = @substrings_new;
1293 }
1294
1295 # Perform all replacements on $word, first removing all
1296 # characters specified in $chars
1297 sub replace_strip_chars {
1298 my ($config, $args, $chars, $word) = @_;
1299 foreach my $char (@$chars) {
1300 $word =~ s/\Q$char\E//g;
1301 }
1302 return replace_line($config, $args, $word);
1303 }
1304
1305 # Perform all replacements on $line based on $config
1306 # $substrings: array of arrays - each one has three elements:
1307 # first 0 or 1, indicating if the substring following it has already
1308 # been replaced or not (1 means it has been replaced), then the
1309 # transliterated string, and lastly the original string.
1310 # If the first element is 0, the second two elements are obviously same
1311 sub replace_line {
1312 my ($config, $args, $line) = @_;
1313 my $substrings = [[0, $line, $line]];
1314 foreach my $replacement (@{$config->{"replacements"}}) {
1315 if ($replacement->{"type"} eq "match") {
1316 my $debug_msg;
1317 if ($args->{"debug"}) {
1318 my $options = join " ", keys(%{$replacement->{"options"}});
1319 $debug_msg = "Match ($options): \"$replacement->{search}\"";
1320 if ($replacement->{"replace"}) {
1321 $debug_msg .= " \"$replacement->{replace}\"\n";
1322 } else {
1323 $debug_msg .= " (ignore)\n";
1324 }
1325 }
1326 replace_match($config, $replacement, $substrings, $debug_msg);
1327 } elsif ($replacement->{"type"} eq "group") {
1328 my $debug_msg;
1329 if ($args->{"debug"}) {
1330 my $options = join " ", keys(%{$replacement->{"options"}});
1331 my $tables = '"' . join('" "', @{$replacement->{"tables"}}) . '"';
1332 $debug_msg = "Group ($options): $tables\n";
1333 }
1334 replace_group($config, $replacement, $substrings, $debug_msg);
1335 }
1336 }
1337 # splits all words at the end so that the splitting characters
1338 # aren't taken as unknown words and the unknown words are (hopefully)
1339 # in better chunks for prompting the user about them
1340 split_words($config, $substrings);
1341
1342 return $substrings;
1343 }
1344
1345 # Call the unknown word window with the given substrings and index
1346 # See `get_unknown_words` for explanation of other parameters
1347 # (should be obvious)
1348 # Returns:
1349 # 3, if the rest of the line should be skipped
1350 # 1, if the line needs to be re-transliterated
1351 # 0, if nothing needs to be done
1352 sub call_unknown_word_window {
1353 my ($substrings, $i, $config, $args, $cur_lineno) = @_;
1354 my $word = $substrings->[$i];
1355 my $contextl = "";
1356 my $contextl_orig = "";
1357 foreach my $j (0..$i-1) {
1358 $contextl .= $substrings->[$j]->[1];
1359 $contextl_orig .= $substrings->[$j]->[2];
1360 }
1361 my $contextr = "";
1362 my $contextr_orig = "";
1363 foreach my $j ($i+1..$#$substrings) {
1364 $contextr .= $substrings->[$j]->[1];
1365 $contextr_orig .= $substrings->[$j]->[2];
1366 }
1367 my $action = prompt_unknown_word($contextl, $contextl_orig,
1368 $word->[1], $word->[2], $contextr, $contextr_orig,
1369 $config, "$cur_lineno", $args);
1370 # if $ret == 3, rest of line should be skipped
1371 # if $ret == 2, config could not be loaded
1372 # if $ret == 1, line must be redone with new config
1373 my $ret = handle_unknown_word_action($action, $config, $args);
1374 # keep retrying until the user chooses an action which
1375 # didn't throw an error
1376 while ($ret == 2) {
1377 $action = prompt_unknown_word($contextl, $contextl_orig,
1378 $word->[1], $word->[2], $contextr, $contextr_orig,
1379 $config, "$cur_lineno", $args, 1);
1380 $ret = handle_unknown_word_action($action, $config, $args);
1381 }
1382 return $ret;
1383 }
1384
1385 # NOTE: MUST ALWAYS ADD REPLACEMENT WORDS FIRST!
1386 # If an ignore word is added which is attached to a word that should have a replacement
1387 # added and just that word is selected to ignore, you never get a chance to add a
1388 # replacement for the other word that it is attached to
1389
1390 # NOTE: This is very ugly code. The GUI code is the worst, but this whole part
1391 # of the program is nasty. This is partially due to the fact that features kept
1392 # being added when their use was discovered. This problem might be fixed in the
1393 # future when I have time to rewrite all of this.
1394
1395 # Handle unknown words
1396 # $substrings - the current substrings with unknown words
1397 # $config - the program config
1398 # $args - the command line args
1399 # $cur_lineno - display string to show the user the current line number
1400 # Returns:
1401 # 1 - the line needs to be re-transliterated
1402 # 0 - all done
1403 sub get_unknown_words {
1404 my ($substrings, $config, $args, $cur_lineno) = @_;
1405 foreach my $i (0 .. $#$substrings) {
1406 my $word = $substrings->[$i];
1407 if (!$word->[0] && !exists($config->{"ignore_words"}->{$word->[1]})) {
1408 my $ret = call_unknown_word_window $substrings, $i, $config, $args, $cur_lineno;
1409 # 3 means we ignore the line
1410 if ($ret == 3) {
1411 foreach my $s (@$substrings) {
1412 # revert all changes done on the line
1413 $s->[1] = $s->[2];
1414 }
1415 return 0;
1416 }
1417 # 1 means the line needs to be re-transliterated
1418 return 1 if $ret == 1;
1419 }
1420 $i++;
1421 }
1422 return 0;
1423 }
1424
1425 # Main replacement function
1426 # Opens the input file ($args->{"input"}) and writes the transliterated text
1427 # to the file handle $outputfh, prompting the user for unknown words or
1428 # word choices (if those aren't disabled on the command line)
1429 sub replace {
1430 my ($config, $args, $total_lines, $inputfh, $outputfh) = @_;
1431 while (my $line = <$inputfh>) {
1432 next if $. < $args->{"start"};
1433 my $comment;
1434 if (exists $config->{"comment"}) {
1435 $line =~ s/\Q$config->{comment}\E(.*)\z//s;
1436 $comment = $1;
1437 }
1438 my $nfd_line = NFD($line);
1439 my $substrings = replace_line($config, $args, $nfd_line);
1440
1441 if (!$args->{"nounknowns"}) {
1442 # re-transliterate the string if the config was reloaded
1443 while (get_unknown_words($substrings, $config, $args, "$./$total_lines")) {
1444 $substrings = replace_line($config, $args, $nfd_line);
1445 }
1446 } elsif ($args->{"debugspecial"}) {
1447 foreach my $s (@$substrings) {
1448 if (!$s->[0] && !exists($config->{"ignore_words"}->{$s->[1]})) {
1449 warn "Unknown word: \"$s->[1]\"\n";
1450 }
1451 }
1452 }
1453 if (!$args->{"nochoices"}) {
1454 # this only loops more than once if the user presses the button
1455 # "Open in unknown word window"
1456 while (my $ret = prompt_choose_word($substrings, $config, $args, "$./$total_lines")) {
1457 if ($ret == 1) {
1458 $substrings = replace_line($config, $args, $nfd_line);
1459 }
1460 }
1461 } elsif ($args->{"debugspecial"}) {
1462 foreach my $s (@$substrings) {
1463 if ($s->[0] && $s->[1] =~ /\Q$config->{choicesep}\E/) {
1464 my $num_choices = split /\Q$config->{choicesep}\E/, $s->[1];
1465 warn "Word \"$s->[1]\" with $num_choices word choices.\n";
1466 }
1467 }
1468 }
1469
1470 foreach (@$substrings) {
1471 print $outputfh $_->[1];
1472 }
1473 print $outputfh $comment if $comment;
1474 }
1475 }
1476
1477 my %args = ("config" => "config", "start" => 1, "errors" => "", "output" => "");
1478 GetOptions(
1479 \%args, "debug", "debugspecial",
1480 "nochoices", "nounknowns",
1481 "force", "start=i",
1482 "output=s", "config=s",
1483 "errors=s", "help",
1484 "checkduplicates", "dumptables") or pod2usage(1);
1485
1486 pod2usage(-exitval => 0, -verbose => 2) if $args{"help"};
1487 pod2usage(-exitval => 1, -verbose => 1) if @ARGV > 1;
1488
1489 if (!-f $args{"config"}) {
1490 die "ERROR: config file \"$args{config}\" does not exist or is not a file.\n";
1491 }
1492 my $config = load_config \%args;
1493 if (!$config) {
1494 die "ERROR: Invalid config\n";
1495 }
1496 exit 0 if ($args{"checkduplicates"} || $args{"dumptables"});
1497
1498 my $inputfh;
1499 my $total_lines = "UNKNOWN";
1500 if (@ARGV < 1) {
1501 warn "WARNING: no input file supplied; taking input from STDIN\n";
1502 $inputfh = \*STDIN;
1503 } else {
1504 open $inputfh, "<", $ARGV[0] or die "ERROR: Cannot open input file \"$ARGV[0]\" for reading.\n";
1505 # Is there *really* no more efficient way to get the total number of lines?
1506 $total_lines = 0;
1507 while (<$inputfh>) {$total_lines++};
1508 close $inputfh;
1509 open $inputfh, "<", $ARGV[0] or die "ERROR: Cannot open input file \"$ARGV[0]\" for reading.\n";
1510 }
1511
1512 if (-f $args{"errors"} && !$args{"force"}) {
1513 my $choice = "";
1514 while ($choice !~ /^[yn]$/) {
1515 print STDERR "\"$args{errors}\" already exists. Do you want to overwrite it? ";
1516 $choice = <STDIN>;
1517 chomp $choice;
1518 }
1519 die "ERROR: \"$args{errors}\" already exists.\n" if $choice ne "y";
1520 }
1521
1522 my $outputfh;
1523 if ($args{"output"} eq "") {
1524 warn "WARNING: no output file supplied; printing to STDOUT\n";
1525 $outputfh = \*STDOUT;
1526 } elsif (-f $args{"output"} && !$args{"force"}) {
1527 my $choice = "";
1528 while ($choice !~ /^[aoe]$/) {
1529 print STDERR "\"$args{output}\" already exists. (a)ppend, (o)verwrite, or (e)xit? ";
1530 $choice = <STDIN>;
1531 chomp $choice;
1532 }
1533 if ($choice eq "a") {
1534 open $outputfh, ">>", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n";
1535 } elsif ($choice eq "e") {
1536 die "ERROR: \"$args{output}\" already exists.\n";
1537 } else {
1538 open $outputfh, ">", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n";
1539 }
1540 } else {
1541 open $outputfh, ">", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n";
1542 }
1543
1544 replace($config, \%args, $total_lines, $inputfh, $outputfh);
1545 close $inputfh;
1546 close $outputfh;
1547
1548 __END__
1549
1550 =head1 NAME
1551
1552 transliterate.pl - Transliterate text files
1553
1554 =head1 SYNOPSIS
1555
1556 transliterate.pl [options][input file]
1557
1558 Start the transliteration engine with the given file as input.
1559 The input file defaults to STDIN if no filename is given.
1560
1561 =head1 OPTIONS
1562
1563 =over 8
1564
1565 =item B<--output> <filename>
1566
1567 Sets the output file to print to.
1568
1569 If the file exists already and B<--force> is not set, the user is asked
1570 if the file should be overwritten or appended to.
1571
1572 B<Default:> C<STDOUT> (print to terminal)
1573
1574 =item B<--config> <filename>
1575
1576 Sets the configuration file to use.
1577
1578 B<Default:> C<config>
1579
1580 =item B<--checkduplicates>
1581
1582 Prints all duplicate words within single table files and across tables
1583 that are replaced within the same group, then exits the program.
1584
1585 Note that this simply prints B<all> duplicates, even ones that are
1586 legitimate. When duplicates are found during normal operation of
1587 the program, they are simply combined in exactly the same way as the
1588 regular word choices.
1589
1590 Also note that the words are still added as possible choices, which
1591 may be slightly confusing. If, for instance, a word "word" is stored
1592 in the tables "tablea", "tableb", and "tablec" with the replacements
1593 "a", "b", and "c", the first duplicate message will say that the
1594 first occurrence was in table "tablea" with the replacement "a", and
1595 the second duplicate message will say that the first occurrence was
1596 in table "tablea" with the replacement "a$b" (assuming $ is the
1597 value set as B<choicesep> in the config). This is just something to
1598 be aware of.
1599
1600 On that note, before duplicates are checked between tables in the
1601 same replacement group, duplicates inside the same file are already
1602 replaced, so that might be a bit confusing as well.
1603
1604 =item B<--dumptables>
1605
1606 Prints the words of all tables that don't have B<nodisplay> set.
1607
1608 This is mainly meant to be used for generating word lists in order to
1609 use them in a spell checker. Note that the words printed here are in
1610 UTF-8 NFC (Unicode Canonical Composition Form), so it may not be ideal
1611 when the spellchecked text is not in the same form.
1612
1613 =item B<--nochoices>
1614
1615 Disables prompting for the right word when multiple replacement words exist.
1616
1617 This can be used to "weed out" all the unknown words before
1618 commencing the laborious task of choosing the right word every time
1619 multiple options exist.
1620
1621 =item B<--nounknowns>
1622
1623 Disables prompting for the right word when a word is not found in the database.
1624
1625 This can be used together with B<--nochoices> to perform a quick test of how
1626 well the actual engine is working without having to click through all the
1627 prompts.
1628
1629 =item B<--debug>
1630
1631 Prints information helpful for debugging problems with the B<match> and B<group>
1632 statements.
1633
1634 For each B<match> or B<group> statement which replaces anything, the original
1635 statement is printed (the format is a bit different than in the config) and
1636 each actual word that's replaced is printed.
1637
1638 =item B<--debugspecial>
1639
1640 This option is only useful for automatic testing of the transliteration engine.
1641
1642 If B<--nochoices> is enabled, each word in the input with multiple choices will
1643 be output, along with the number of choices (can be used to test the proper
1644 functioning of B<choicesep> in the config file).
1645
1646 If B<--nounknowns> is enabled, each unknown word in the input is printed
1647 (can be used to test that the B<ignore> options are working correctly).
1648
1649 =item B<--force>
1650
1651 Always overwrites the output and error file without asking.
1652
1653 =item B<--start> <line number>
1654
1655 Starts at the given line number instead of the beginning of the file.
1656
1657 Note: when "Stop processing" is pressed, the current line number is
1658 printed out. This is the current line that was being processed, so it
1659 has not been printed to the output file yet and thus the program must
1660 be resumed at that line, not the one afterwards.
1661
1662 =item B<--errors> <filename>
1663
1664 Specifies a file to write errors in. Note that this does not refer to
1665 actual errors, but to any words that were temporarily ignored
1666 (i.e. words for which "Ignore: This run" was clicked).
1667
1668 If no file is specified, nothing is written. If a file is specified
1669 that already exists and B<--force> is not set, the user is prompted
1670 for action.
1671
1672 =item B<--help>
1673
1674 Displays the full documentation.
1675
1676 =back
1677
1678 =head1 DESCRIPTION
1679
1680 B<transliterate.pl> will read the given input file and transliterate it
1681 based on the given configuration file, prompting the user for action if
1682 a word has multiple replacement options or is not found in the database.
1683
1684 See L</"CONFIGURATION"> for details on what is possible.
1685
1686 Note that this is B<not> some sort of advanced transliteration engine
1687 which understands the grammar of the language and tries to guess words
1688 based on that. This is only a glorified find-and-replace program
1689 with some extra features to make it useful for transliterating text
1690 using large wordlists.
1691
1692 WARNING: All input data is assumed to be UTF-8!
1693
1694 =head1 WORD CHOICE WINDOW
1695
1696 The word choice window is opened any time one word has multiple replacement
1697 options and prompts the user to choose one.
1698
1699 For each word with multiple options, the user must choose the right option
1700 and then press "Accept changes" to finalize the transliteration of the
1701 current line. The button to accept changes is selected by default, so it
1702 is possible to just press enter instead of manually clicking it. Before the
1703 line is finalized, the user may press "Undo" to undo any changes on the
1704 current line.
1705
1706 "Skip word" just leaves it as is. This shouldn't be needed in most cases
1707 since B<choicesep> should always be set to a character that doesn't occur
1708 normally in the text anyways.
1709
1710 "Open in unknown word window" will open the
1711 L<unknown word window|/"UNKNOWN WORD WINDOW"> with the current word
1712 selected. This is meant as a helper if you notice that another word choice
1713 needs to be added.
1714
1715 Warning: This is very inconsistent and buggy! Since the unknown word window
1716 is just opened directly, it isn't modified to make more sense for this
1717 situation. Whenever "Add replacement" is pressed, the whole line is
1718 re-transliterated as usual, but the word choice window is opened again
1719 right afterwards. If you just want to go back to the word choice window,
1720 press the ignore button for "whole line" since that shouldn't break
1721 anything. There are weird inconsistencies, though - for instance, if you
1722 delete all words in the tables, then press "Reload config", the line will
1723 be re-transliterated and none of the words will actually be found, but it
1724 will still go on because control passes back to the word choice window no
1725 matter what. Also, none of the word choices that were already done on this
1726 line are saved since the line is restarted from the beginning. As I said,
1727 it's only there as a helper and is very buggy/inconsistent. Maybe I'll make
1728 everything work better in a future release.
1729
1730 "Stop processing" will exit the program and print the line number that was
1731 currently being processed.
1732
1733 =head1 UNKNOWN WORD WINDOW
1734
1735 The unknown word window is opened any time a word could not be replaced.
1736
1737 Both the context from the original script and the context from the
1738 transliterated version (so far) is shown. If a part of the text is
1739 selected in one of the text boxes and "Use selection as word" is
1740 pressed for the appropriate box, the selected text is used for the
1741 action that is taken subsequently. "Reset text" resets the text in
1742 the text box to its original state (except for the highlight because
1743 I'm too lazy to do that).
1744
1745 The possible actions are:
1746
1747 =over 8
1748
1749 =item Ignore
1750
1751 "This run" only ignores the word until the program exits, while
1752 "Permanently" saves the word in the ignore file specified in the
1753 configuration. "Whole line" stops asking for unknown words on
1754 this line and prints the line out as it originally was in the
1755 file. Note that any words in the original line that contain
1756 B<choicesep> will still cause the L<word choice window|/"WORD CHOICE WINDOW">
1757 to appear due to the way it is implemented. Just press "Skip word"
1758 if that happens.
1759
1760 =item Retry without <display name>
1761
1762 Removes all characters specified in the corresponding B<retrywithout>
1763 statement in the L<config|/"CONFIGURATION">
1764 from the currently selected word and re-transliterates just that
1765 word. The result is then pasted into the text box beside
1766 "Add replacement" so it can be added to a table. This is only a
1767 sort of helper for languages like Urdu in which words often can
1768 be written with or without diacritics. If the "base form" without
1769 diacritics is already in the tables, this button can be used to
1770 quickly find the transliteration instead of having to type it
1771 out again. Any part of the word that couldn't be transliterated
1772 is just pasted verbatim into the text box (but after the
1773 characters have been removed).
1774
1775 Note that the selection can still be modified after this, before
1776 pressing "Add to list". This could potentially be useful if a word
1777 is in a table that is expanded using "noroot" because for instance
1778 "Retry without diacritics" would only work with the full word (with
1779 the ending), but only the stem should be added to the list. If that
1780 is the case, "Retry without diacritics" could be pressed with the
1781 whole word selected, but the ending could be removed before actually
1782 pressing "Add to list".
1783
1784 A separate button is shown for every B<retrywithout> statement
1785 in the config.
1786
1787 =item Add to list
1788
1789 Adds the word typed in the text box beside "Add replacement" to the
1790 selected table file as the replacement for the word currently selected
1791 and re-runs the replacement on the current line. All table files that
1792 do not have B<nodisplay> set are shown as options, see L</"CONFIGURATION">.
1793
1794 Warning: This simply appends the word and its replacement to the end
1795 of the file, so it will cause an error if there was no newline
1796 ("\n") at the end of the file before.
1797
1798 Note that this always re-transliterates the entire line afterwards.
1799 This is to allow more flexibility. Consider, for instance, a compound
1800 word of which the first part is also a valid single word. If the
1801 entire line was not re-transliterated, it would be impossible to
1802 add a replacement for that entire compound word and have it take
1803 effect during the same run since the first part of the word would
1804 not even be available for transliteration anymore.
1805
1806 One problem is that the word is just written directly to the file
1807 and there is no undo. This is the way it currently is and will
1808 probably not change very soon. If a mistake is made, the word can
1809 always be removed again manually from the list and "Reload config"
1810 pressed.
1811
1812 =item Reload config
1813
1814 Reload the configuration file along with all tables an re-runs the
1815 replacement on the current line. Note that this can take a short
1816 while since the entire word database has to be reloaded.
1817
1818 =item Stop processing
1819
1820 Prints the current line number to the terminal and exits the program.
1821
1822 The program can always be started again at this line number using
1823 the B<--start> option if needed.
1824
1825 =back
1826
1827 =head1 INTERNALS/EXAMPLES
1828
1829 This section was added to explain to the user how the transliteration
1830 process works internally since that may be necessary to understand
1831 why certain words are replaced the way they are.
1832
1833 First off, the process works line-by-line, i.e. no B<match> or B<replace>
1834 statement will ever match anything that crosses the end of a line.
1835
1836 Each line is initially stored as one chunk which is marked as
1837 untransliterated. Then, all B<match>, B<matchignore>, and B<replace>
1838 (or, rather, B<group>) statements are executed in the order they
1839 appear in the config file. Whenever a word/match is replaced, it
1840 is split off into a separate chunk which is marked as transliterated.
1841 A chunk marked as transliterated I<is entirely ignored by any
1842 replacement statements that come afterwards>. Note that B<beginword>
1843 and B<endword> can always match at the boundary between an
1844 untransliterated and transliterated chunk. This is to facilitate
1845 automated replacement of certain grammatical constructions. For instance:
1846
1847 If the string "a-" could be attached as a prefix to any word and needed
1848 to be replaced as "b-" everywhere, it would be quite trivial to add
1849 a match statement C<'match "a-" "b-" beginword'>. If run on the text
1850 "a-word", where "word" is some word that should be transliterated
1851 as "word_replaced", and the group replace statement for the word comes
1852 after the match statement given above, the following would happen:
1853 First, the match statement would replace "a-" and split the text into
1854 the two chunks "b-" and "word", where "b-" is already marked as
1855 transliterated. Since "word" is now separate, it will be matched
1856 by the group replace statement later, even if it has B<beginword> set
1857 and would normally not match if "a-" came before it. Thus, the final
1858 output will be "b-word_replaced", allowing for the uniform replacement
1859 of the prefix instead of having to add each word twice, once with and
1860 once without the prefix.
1861
1862 In certain cases, this behavior may not be desired. Consider, for
1863 instance, a prefix "c-" which cannot be replaced uniformly as in the
1864 example above due to differences in the source and destination script.
1865 Since it cannot be replaced uniformly, two words "word1" and "word2"
1866 would both need to be specified separately with replacements for
1867 "c-word1" and "c-word2". If, however, the prefix "c-" has an
1868 alternate spelling "c " (without the hyphen), it would be very useful
1869 to be able to automatically recognize that as well. This is where the
1870 B<nofinal> attribute for the B<match> statements comes in. If there is
1871 a match statement C<'match "c " "c-" beginword nofinal'>, the replaced
1872 chunk is B<not> marked as transliterated, so after executing this
1873 statement on the text "c word1", there will still only be one chunk,
1874 "c-word1", allowing for the regular word replacements to function
1875 properly.
1876
1877 Once all the replacement statements have been processed, each chunk
1878 of text that is not marked as transliterated yet is split based on
1879 the B<split> pattern specified in the config and all actual characters
1880 matched by the B<split> pattern are marked as transliterated (this
1881 usually means all the spaces, newlines, quotation marks, etc.). Any
1882 remaining words/text chunks that are still marked as untransliterated are
1883 now processed by the unknown word window. If one of these remaining
1884 unknown chunks is present in the file specified by the B<ignore>
1885 statement in the config, it is simply ignored and later printed out
1886 as is. After all untransliterated words have either had a replacement
1887 added or been ignored, any words with multiple replacement choices are
1888 processed by the word choice window. Once this is all done, the final
1889 output is written to the output file and the process is repeated with
1890 the next line. Note that the entire process is started again each time
1891 a word is added to a table or the config is reloaded from the
1892 L<unknown word window|/"UNKNOWN WORD WINDOW">.
1893
1894 =head1 CONFIGURATION
1895
1896 These are the commands accepted in the configuration file.
1897 Any parameters in square brackets are optional.
1898 Comments are started with C<#>. Strings (filenames, regex strings, etc.)
1899 are enclosed in double quotes ("").
1900
1901 The B<match>, B<matchignore>, and B<replace> commands are executed in
1902 the order they are specified, except that all B<replace> commands within
1903 the same group are replaced together.
1904
1905 The B<match> and B<matchignore> statements accept any RegEx strings and
1906 are thus very powerful. The B<group> statements only work with the
1907 non-RegEx words from the tables, but are very efficient for large numbers
1908 of words and should thus be used for the main bulk of the words.
1909
1910 Any duplicate words found will cause the user to be prompted to choose
1911 one option every time the word is replaced in the input text.
1912
1913 Note that any regex strings specified in the config should B<not>
1914 contain capture groups, as that would break the B<endword> functionality
1915 since this is also implemented internally using capture groups. Capture
1916 groups are also entirely pointless in the config since they currently
1917 cannot be used as part of the replacement string in B<match> statements.
1918 Lookaheads and lookbehinds are fine, though, and could be useful in
1919 certain cases.
1920
1921 All tables must be loaded before they are used, or there will be an
1922 error that the table does not exist.
1923
1924 Warning: If a B<replace> statement is located before an B<expand>
1925 statement that would have impacted the table used, there will be no
1926 error but the expand statement won't have any impact.
1927
1928 Basic rule of thumb: Always put the B<table> statements before the
1929 B<expand> statements and the B<expand> statements before the B<replace>
1930 statements.
1931
1932 =over 8
1933
1934 =item B<split> <regex string>
1935
1936 Sets the RegEx string to be used for splitting words. This is only used
1937 for splitting the words which couldn't be replaced after all replacement
1938 has been done, before prompting the user for unknown words.
1939
1940 Note that B<split> should probably always contain at least C<\n>, since
1941 otherwise all of the newlines will be marked as unknown words. Usually,
1942 this will be included anyways through C<\s>.
1943
1944 Note also that B<split> should probably include the C<+> RegEx-quantifier
1945 since that allows the splitting function in the end to ignore several
1946 splitting characters right after each other (e.g. several spaces) in one
1947 go instead of splitting the string again for every single one of them.
1948 This shouldn't actually make any difference functionality-wise, though.
1949
1950 B<Default:> C<\s+> (all whitespace)
1951
1952 =item B<beforeword> <regex string>
1953
1954 Sets the RegEx string to be matched before a word if B<beginword> is set.
1955
1956 B<Default:> C<\s>
1957
1958 =item B<afterword> <regex string>
1959
1960 Sets the RegEx string to be matched after a word if B<endword> is set.
1961
1962 Note that B<afterword> should probably always contain at least C<\n>,
1963 since otherwise words with B<endword> set will not be matched at the
1964 end of a line.
1965
1966 B<beforeword> and B<afterword> will often be exactly the same, but
1967 they are left as separate options in case more fine-tuning is needed.
1968
1969 B<Default:> C<\s>
1970
1971 =item B<tablesep> <string>
1972
1973 Sets the separator used to split the lines in the table files into the
1974 original and replacement word.
1975
1976 B<Default:> C<Tab>
1977
1978 =item B<choicesep> <string>
1979
1980 Sets the separator used to split replacement words into multiple choices for
1981 prompting the user.
1982
1983 B<Default:> C<$>
1984
1985 =item B<comment> <string>
1986
1987 If enabled, anything after C<< <string> >> will be ignored on all lines in
1988 the input file. This will not be displayed in the
1989 L<unknown word window|/"UNKNOWN WORD WINDOW"> or L<word choice window|/"WORD CHOICE WINDOW">
1990 but will still be printed in the end, with the comment character removed
1991 (that seems to be the most sensible thing to do).
1992
1993 Note that this is really just a "dumb replacement", so there's no way to
1994 prevent a line with the comment character from being ignored. Just try
1995 to always set this to a character that does not occur anywhere in the text
1996 (or don't use the option at all).
1997
1998 =item B<ignore> <filename>
1999
2000 Sets the file of words to ignore.
2001
2002 This has to be set even if the file is just empty because the user can
2003 add words to it from the unknown word window.
2004
2005 =item B<table> <table identifier> <filename> [nodisplay] [revert]
2006
2007 Load the table from C<< <filename> >>, making it available for later use in the
2008 B<expand> and B<replace> commands using the identifier C<< <table identifier> >>.
2009
2010 if B<nodisplay> is set, the filename for this table is not shown in the
2011 L<unknown word window|/"UNKNOWN WORD WINDOW">. If, however, the same filename
2012 is loaded again for another table that does not have B<nodisplay> set, it is
2013 still displayed.
2014
2015 If B<revert> is set, the original and replacement words are switched. This can
2016 be useful for creating a config for transliterating in the opposite direction
2017 with the same database. I don't know why I called it "revert" since it should
2018 actually be called "reverse". I guess I was a bit confused.
2019
2020 Note that if C<< <filename> >> is not an absolute path, it is taken to be relative
2021 to the location of the configuration file.
2022
2023 The table files simply consist of B<tablesep>-separated values, with the word in the
2024 original script first and the replacement word second. Both the original and
2025 replacement word can optionally have several parts separated by B<choicesep>. If the
2026 original word has multiple parts, it is separated and each of the parts is added
2027 to the table with the replacement. If the replacement has multiple parts, the user
2028 will be prompted to choose one of the options during the transliteration process.
2029 If the same word occurs multiple times in the same table with different replacements,
2030 the replacements are automatically added as choices that will be handled by the
2031 L<word choice window|/"WORD CHOICE WINDOW">.
2032
2033 If, for whatever reason, the same table is needed twice, but with different endings,
2034 the table can simply be loaded twice with different IDs. If the same path is loaded,
2035 the table that has already been loaded will be reused. Note that this feature was
2036 added before adding B<revert>, so the old table is used even if it had B<revert>
2037 set and the new one doesn't. This is technically a problem, but I don't know of
2038 any real-world case where it would be a problem, so I'm too lazy to change it.
2039 Tell me if it actually becomes a problem for you.
2040
2041 WARNING: Don't load the same table file both with and without B<revert> in the same
2042 config! When a replacement word is added through the GUI, the program has to know
2043 which way to write the words. Currently, whenever a table file is loaded with
2044 B<revert> anywhere in the config (even if it is loaded without B<revert> in a
2045 different place), words will automatically be written as if B<revert> was on. I
2046 cannot currently think of any reason why someone would want to load a file both
2047 with and without B<revert> in the same config, but I still wanted to add this
2048 warning just in case.
2049
2050 =item B<expand> <table identifier> <word ending table> [noroot]
2051
2052 Expand the table C<< <table identifier> >>, i.e. generate all the word forms using
2053 the word endings in C<< <word ending table> >>, saving the result as a table with the
2054 identifier C<< <new table identifier> >>.
2055
2056 Note: There used to be a C<< <new table identifier> >> argument to create a new
2057 table in case one table had to be expanded with different endings. This has been
2058 removed because it was a bit ugly, especially since there wasn't a proper mapping
2059 from table IDs to filenames anymore. If this functionality is needed, the same table
2060 file can simply be loaded multiple times. See the B<table> section above.
2061
2062 If B<noroot> is set, the root forms of the words are not kept.
2063
2064 If the replacement for a word ending contains B<choicesep>, it is split and each part
2065 is combined with the root form separately and the user is prompted to choose one of
2066 the options later. it is thus possible to allow multiple choices for the ending if
2067 there is a distinction in the replacement script but not in the source script.
2068 Note that each of the root words is also split into its choices (if necessary)
2069 during the expanding, so it is possible to use B<choicesep> in both the endings
2070 and root words.
2071
2072 =item B<match> <regex string> <replacement string> [beginword] [endword] [nofinal]
2073
2074 Perform a RegEx match using the given C<< <regex string> >>, replacing it with
2075 C<< <replacement string> >>. Note that the replacement cannot contain any RegEx
2076 (e.g. groups) in it. B<beginword> and B<endword> specify whether the match must
2077 be at the beginning or ending of a word, respectively, using the RegEx specified
2078 in B<beforeword> and B<afterword>. If B<nofinal> is set, the string is not marked
2079 as transliterated after the replacement, allowing it to be modified by subsequent
2080 B<match> or B<replace> commands.
2081
2082 =item B<matchignore> <regex string> [beginword] [endword]
2083
2084 Performs a RegEx match in the same manner as B<match>, except that the original
2085 match is used as the replacement instead of specifying a replacement string, i.e.
2086 whatever is matched is just marked as transliterated without changing it.
2087
2088 =item B<group> [beginword] [endword]
2089
2090 Begins a replacement group. All B<replace> commands must occur between B<group>
2091 and B<endgroup>, since they are then grouped together and replaced in one go.
2092 B<beginword> and B<endword> act in the same way as specified for B<match> and
2093 apply to all B<replace> statements in this group.
2094
2095 =item B<replace> <table identifier> [override]
2096
2097 Replace all words in the table with the identifier C<< <table identifier> >>,
2098 using the B<beginword> and B<endword> settings specified by the current group.
2099
2100 Unless B<override> is set on the latter table, if the same word occurs in two
2101 tables with different replacements, both are automatically added as choices.
2102 See L</"WORD CHOICE WINDOW">.
2103
2104 B<override> can be useful if the same database is used for both directions and
2105 one direction maps multiple words to one word, but in the other direction this
2106 word should always default to one of the choices. In that case, a small table
2107 with these special cases can be created and put at the end of the main B<group>
2108 statement with B<override> set. This is technically redundant since you could
2109 just add a special group with only the override table in it earlier in the
2110 config, but it somehow seems cleaner this way.
2111
2112 Note that a table must have been loaded before being used in a B<replace> statement.
2113
2114 =item B<endgroup>
2115
2116 End a replacement group.
2117
2118 =item B<retrywithout> <display name> [character] [...]
2119
2120 Adds a button to the L<unknown word window|/"UNKNOWN WORD WINDOW"> to retry the
2121 replacements on the selected word, first removing the given characters.
2122 The button is named "<display name>" and located after the "Retry without" label.
2123 Whatever is found with the replacements is pasted into the regular text box for
2124 the "Add replacement" functionality.
2125
2126 This can be used as an aid when, for instance, words can be written with or without
2127 certain diacritics. If the actual word without diacritics is already in the
2128 database and there is a B<retrywithout> statement for all the diacritics, the
2129 button can be used to quickly find the replacement for the word instead of having
2130 to type it out manually. The same goes for compound words that can be written
2131 with or without a space.
2132
2133 It is also possible to specify B<retrywithout> without any characters, which just
2134 adds a button that takes whatever word is selected and retries the replacements
2135 on it. This can be useful if you want to manually edit words and quickly see if
2136 they are found with the edits in place.
2137
2138 Note that all input text is first normalized to the unicode canonical decomposition
2139 form so that diacritics can be removed individually.
2140
2141 Also note that all buttons are currently just dumped in the GUI without any
2142 sort of wrapping, so they'll run off the screen if there are too many.
2143 Tell me if this becomes a problem. I'm just too lazy to change it right now.
2144
2145 Small warning: This only removes the given characters from the word selected in
2146 the GUI, not from the tables. Thus, this only works if the version of the word
2147 without any of the characters is already present in the tables. It would be useful
2148 when handling diacritics if the program could simply make a comparison while
2149 completely ignoring diacritics, but I haven't figured out a nice way to implement
2150 that yet.
2151
2152 Historical note: This was called B<diacritics> in a previous version and only
2153 allowed removal of diacritics. This is exactly the same functionality, just
2154 generalized to allow removal of any characters with different buttons.
2155
2156 =item B<targetdiacritics> <diacritic> [...]
2157
2158 This was only added to simplify transliteration from Hindi to Urdu with the
2159 same database. When this is set, the choices in the
2160 L<word choice window|/"WORD CHOICE WINDOW"> are sorted in descending order
2161 based on the number of diacritics from this list that are matched in each
2162 choice. This is so that when transliterating from Hindi to Urdu, the choice
2163 with the most diacritics is always at the top.
2164
2165 Additionally, if there are I<exactly> two choices for a word and one of
2166 them contains diacritics but the other one doesn't, the one containing
2167 diacritics is automatically taken without ever prompting the user. This
2168 is, admittedly, a very language-specific feature, but I couldn't think of
2169 a simple way of adding it without building it directly into the actual program.
2170
2171 Note that due to the way this is implemented, it will not take any effect
2172 if B<--nochoices> is enabled.
2173
2174 The attentive reader will notice at this point that most of the features
2175 in this program were added specifically for dealing with Urdu and Hindi,
2176 which does appear to make sense, considering that this program was written
2177 specifically for transliterating Urdu to Hindi and vice versa (although
2178 not quite as much vice versa).
2179
2180 =back
2181
2182 =head1 BUGS
2183
2184 Although it may not seem like it, one of the ugliest parts of the program is the
2185 GUI functionality that allows the user to add a replacement word. The problem is
2186 that all information about the B<expand> and B<replace> statements has to be kept
2187 in order to properly handle adding a word to one of the files and simultaneously
2188 adding it to the currently loaded tables I<without reloading the entire config>.
2189 The way it currently works, the replacement word is directly written to the file,
2190 then all B<expand> statements that would have impacted the words from this file
2191 are redone (just for the newly added word) and the resulting words are added to
2192 the appropriate tables (or, technically, the appropriate 'trie'). Since a file
2193 can be mapped to multiple table IDs and a table ID can occur in multiple replace
2194 statements, this is more complicated than it sounds, and thus it is very likely
2195 that there are bugs lurking here somewhere. Do note that "Reload config" will
2196 B<always> reload the entire configuration, so that's safe to do even if the
2197 on-the-fly replacing doesn't work.
2198
2199 In general, I have tested the GUI code much less than the rest since you can't
2200 really test it automatically very well.
2201
2202 The code is generally quite nasty, especially the parts belonging to the GUI.
2203 Don't look at it.
2204
2205 Tell me if you find any bugs.
2206
2207 =head1 SEE ALSO
2208
2209 perlre, perlretut
2210
2211 =head1 LICENSE
2212
2213 Copyright (c) 2019, 2020, 2021 lumidify <nobody[at]lumidify.org>
2214
2215 Permission to use, copy, modify, and/or distribute this software for any
2216 purpose with or without fee is hereby granted, provided that the above
2217 copyright notice and this permission notice appear in all copies.
2218
2219 THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
2220 WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
2221 MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
2222 ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
2223 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
2224 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
2225 OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
2226
2227 =cut