diff --git a/lib/FDS.pm b/lib/FDS.pm
index 81fc9faa51dd045a3a4323ad4689d83fbd6a6504..5f441f6ca4f8b78ec0074ef05a7c35cbbc757be1 100644
--- a/lib/FDS.pm
+++ b/lib/FDS.pm
@@ -75,7 +75,7 @@ sub open
   my $fo_fnm= sprintf ($self->{out_pattern} . $self->{out_extension}, ++$self->{_count});
 
   local *FO_RECODED;
-  if ($self->{'compress'} == 1)
+  if ($self->{compress} == 1)
   {
     open (FO_RECODED, '|-', "gzip -c >'$fo_fnm'") or die "can't write to [$fo_fnm]";
   }
@@ -116,7 +116,10 @@ sub print
   if ($self->{compress} == 2)
   {
     # binmode (FO, ':raw');
-    $px= print FO compress($l);
+    utf8::encode($l);
+    my $compressed= compress($l);
+    # print __LINE__, " compressed=[$compressed]\n";
+    $px= print FO $compressed;
   }
   else
   {
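
Note: with compress == 2, FDS::print() now utf8-encodes each record before
handing it to Compress::Zlib::compress(), since compress() operates on
octets rather than on Perl character strings.  A minimal round-trip sketch
(the sample data is illustrative, not part of the patch):

  use Compress::Zlib;

  my $record= "wiki text with umlauts: \x{e4}\x{f6}\x{fc}";
  utf8::encode($record);                  # character string -> UTF-8 octets
  my $compressed= compress($record);      # what FDS::print() writes for compress == 2

  my $restored= uncompress($compressed);  # back to UTF-8 octets
  utf8::decode($restored);                # octets -> character string again
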
diff --git a/lib/Wiktionary/Text.pm b/lib/Wiktionary/Text.pm
new file mode 100644
index 0000000000000000000000000000000000000000..6b9f229a18c1f97231aab7f0c78ed307234f8b94
--- /dev/null
+++ b/lib/Wiktionary/Text.pm
@@ -0,0 +1,123 @@
+
+package Wiktionary::Text;
+
+use Data::Dumper;
+
+sub analyze_wiki_text
+{
+  my $lines= shift;
+
+  my $type= 'unknown';
+  my $language= 'unknown';
+
+  return ('empty', $language, []) unless (@$lines);
+
+  # print __LINE__, " analyze_wiki_text: ", Dumper($lines);
+
+  my @errors;
+
+  TEXT: foreach my $l (@$lines)
+  {
+    print __LINE__, " [$l]\n";
+
+    if ($l =~ m#^=#)
+    {
+      my @tokens= split(' ', $l);
+
+      my $hl_o= shift(@tokens);
+      my $hl_c= pop(@tokens);
+
+      if ($hl_o ne $hl_c)
+      {
+        push (@errors, ['heading mismatch', $l, "hl_o=[$hl_o] hl_c=[$hl_c]", \@tokens]);
+        next TEXT;
+      }
+
+      my $hl= length($hl_o);
+      print __LINE__, " heading level=[$hl] tokens: ", Dumper(\@tokens);
+
+      my ($words, $macro_infos)= analyze_heading_tokens(@tokens);
+      print __LINE__, " words: ", Dumper($words);
+      print __LINE__, " macro_infos: ", Dumper($macro_infos);
+    }
+  }
+
+  if (@errors)
+  {
+    print __LINE__, " errors: ", Dumper(\@errors);
+  }
+
+  return ($type, $language, \@errors);
+}
+
+sub analyze_heading_tokens
+{
+  my @tokens= @_;
+
+  my @words= ();
+  my @macro_infos= ();
+
+  while (my $token= shift(@tokens))
+  {
+    if ($token=~ m#^\(?\{\{(.+)}}\)?#)
+    {
+      my $macro= $1;
+      push (@macro_infos, process_macro($macro));
+    }
+    elsif ($token =~ m#^\(?\{\{(.+)#)
+    {
+      my $macro= $1;
+
+      T2: while (my $t2= shift(@tokens)) # find the end of the macro
+      {
+        if ($t2 =~ m#(.+)}}\)?,?$#) # there could be several macros, separated by ,
+        {
+          $macro .= ' ' . $1;
+          last T2;
+        }
+        else
+        {
+          $macro .= ' '. $t2;
+        }
+      }
+
+      print __LINE__, " macro=[$macro]\n";
+
+      push (@macro_infos, process_macro($macro));
+    }
+    else
+    {
+      push (@words, $token);
+    }
+  }
+
+  print __LINE__, " words: ", Dumper(\@words);
+  print __LINE__, " macro_infos: ", Dumper(\@macro_infos);
+
+  (\@words, \@macro_infos);
+}
+
+sub process_macro
+{
+  my $macro_string= shift;
+
+  my @elements= split (/\|/, $macro_string);
+  print __LINE__, " elements: ", Dumper(\@elements);
+
+  \@elements;
+}
+
+
+1;
+
+__END__
+
+=head1 NOTES
+
+=head2 heading level 2
+
+  format: == string ({{language_label|language}}) ==
+
+there can be several sections for the same title representing several languages
+
+
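
Note: analyze_wiki_text() walks the text of one page, checks that heading
markers balance (the opening and closing runs of = must match), and splits
the remaining heading tokens into plain words and {{...}} macro elements;
process_macro() then splits each macro on |.  $type and $language are still
returned as placeholders ('unknown') at this stage.  One caveat: the
while (my $token= shift(@tokens)) loops stop early when a token evaluates to
false, e.g. a literal 0 in a heading.  A minimal usage sketch (the sample
heading is made up, following the format documented in the NOTES section):

  use lib 'lib';
  use Wiktionary::Text;

  my @text=
  (
    '== Beispiel ({{Sprache|Deutsch}}) ==',
    'body text of the entry',
  );

  my ($type, $language, $errors)= Wiktionary::Text::analyze_wiki_text(\@text);
  print "type=[$type] language=[$language] errors=[", scalar @$errors, "]\n";
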
diff --git a/wkt1.pl b/wkt1.pl
index 9f9473b551ce9cb6ff8d924f76dba02dc343822c..20fc0b5de3c0d5303ee5ea1634e55303a31e3035 100755
--- a/wkt1.pl
+++ b/wkt1.pl
@@ -3,7 +3,6 @@
 use strict;
 
 use JSON;
-use FileHandle;
 
 use Util::JSON;
 use Util::Simple_CSV;
@@ -11,9 +10,16 @@ use Util::Simple_CSV;
 use Data::Dumper;
 $Data::Dumper::Indent= 1;
 
+use FileHandle;
+
+binmode( STDOUT, ':utf8' ); autoflush STDOUT 1;
+binmode( STDERR, ':utf8' ); autoflush STDERR 1;
+binmode( STDIN, ':utf8' );
+
 use lib 'lib';
 use wkutils;
 use Wiktionary::Utils;
+use Wiktionary::Text;
 
 use FDS;
 
@@ -34,8 +40,6 @@
 my $fo_compress= 2; # 1..compress output stream by piping into gzip; DO NOT USE
                     # 2..compress individual records using Compress::Zlib::compress()
 
-binmode (STDOUT, ':utf8');
-
 my @PARS= ();
 while (my $arg= shift (@ARGV))
 {
@@ -94,6 +98,8 @@ EOX
 
 analyze_wiktionary_dump ($fnm);
 
+my $ts_stop= localtime (time());
+
 exit(0);
 
 sub analyze_wiktionary_dump
@@ -112,6 +118,7 @@ sub analyze_wiktionary_dump
     print "mkdir $data_dir\n";
     mkdir ($data_dir);
   }
+
   unless (-d $out_dir)
   {
     print "mkdir $out_dir\n";
@@ -127,7 +134,7 @@ sub analyze_wiktionary_dump
   print FO_ITEMS join ($TSV_SEP, @cols1), "\n";
   autoflush FO_ITEMS 1;
 
-  my $fo_rec= new FDS('out_pattern' => "${out_dir}/wkt%05d");
+  my $fo_rec= new FDS('out_pattern' => "${out_dir}/wkt%05d", compress => $fo_compress);
   # $fo_rec->set (compress => 0, out_extension => '');
   my $fo_count= $fo_rec->open();
   my $fo_pos= 0;
@@ -140,7 +147,7 @@ sub analyze_wiktionary_dump
   my %frame;
   my @text;
   my $cnt_ATTN= 0;
-  my $debug_item= 0;
+  my @debug_item= ();
   LINE: while (1)
   {
     $pos= tell(FI);
@@ -203,6 +210,10 @@ sub analyze_wiktionary_dump
       {
         $state= 1;
       }
+      elsif ($l =~ m#^\s*<text xml:space="preserve" */>#) # NOTE: empty text
+      {
+        $state= 1;
+      }
       elsif ($l =~ m#^\s*<text xml:space="preserve">(.*)#) # TODO: check for other <text> tags
       {
         my $t= $1;
@@ -212,9 +223,10 @@ sub analyze_wiktionary_dump
       }
       elsif ($l =~ m#^\s*<text(.*)>#) # TODO: check for other <text> tags
       {
-        print "ATTN: strange text-tag: [$l] title=[$frame{title}]\n";
+        my $msg= "ATTN: strange text-tag: [$l] title=[$frame{title}]";
+        print $msg, "\n";
         $cnt_ATTN++;
-        $debug_item= 1;
+        push (@debug_item, $msg);
       }
       elsif ($l =~ m#^\s*<(id|sha1)>([^<]+)</.+>#)
       {
@@ -237,19 +249,37 @@ sub analyze_wiktionary_dump
 
     if ($flush)
     {
-      $fo_rec->print (join ("\n", @lines));
+      my $frame= join ("\n", @lines);
+      # utf8::encode($frame);
+      $fo_rec->print ($frame);
       $frame{fo_pos_end}= $fo_rec->tell();
 
-      if ($debug > 1 || $debug_item)
+      if ($debug > 1 || @debug_item)
      {
         print "="x72, "\n";
+
+        if (@debug_item)
+        {
+          print __LINE__, " debug_item reasons:\n";
+          foreach my $msg (@debug_item)
+          {
+            print __LINE__, " * reason=[", $msg, "]:\n";
+          }
+        }
+
         print __LINE__, " frame: ", Dumper(\%frame);
         print __LINE__, " text: ", Dumper(\@text);
         print __LINE__, " lines: ", Dumper (\@lines);
         print "="x72, "\n";
-        $debug_item= 0;
+        @debug_item= ();
+      }
+
+      # process wiki text
+      if ($seq eq 'b')
+      {
+        print __LINE__, " id=[", $frame{id}, "] title=[", $frame{title}, "]\n";
+        Wiktionary::Text::analyze_wiki_text(\@text);
       }
 
       print FO_ITEMS join ($TSV_SEP, map { $frame{$_} } @cols1), "\n";
@@ -277,3 +307,4 @@ sub analyze_wiktionary_dump
 
   1;
 }
+
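
Note: the new empty-element branch keeps pages whose body is just
<text xml:space="preserve" /> out of the ATTN case, and strange text-tags
are now pushed onto @debug_item so the flush block can print the reason next
to the frame dump.  A small illustration of which branch each <text> variant
hits (the sample lines are assumed, not taken from a real dump):

  foreach my $l ('  <text xml:space="preserve" />',
                 '  <text xml:space="preserve">== x ==',
                 '  <text bytes="123">')
  {
    if ($l =~ m#^\s*<text xml:space="preserve" */>#)
    { print "empty text\n"; }            # new branch: just sets $state= 1
    elsif ($l =~ m#^\s*<text xml:space="preserve">(.*)#)
    { print "text starts: [$1]\n"; }     # normal case: body begins after the tag
    elsif ($l =~ m#^\s*<text(.*)>#)
    { print "strange text-tag: [$1]\n"; } # counted and recorded in @debug_item
  }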