diff --git a/wdq1.pl b/wdq1.pl index 1ccc219952652c0d0907109586a5129fbbce3d13..7a43fe8b909c12bd5f763d92102259d72ca79064 100755 --- a/wdq1.pl +++ b/wdq1.pl @@ -354,7 +354,7 @@ my $fo_count= $fo_rec->open(); # print "[$line] [$pos] [$l]\n"; my $j; - eval { $j= decode_json ($l); }; + eval { $j= decode_json ($l); }; if ($@) { print "[$line] [$pos] ERROR=[", $@, "]\n"; @@ -363,286 +363,294 @@ my $fo_count= $fo_rec->open(); next LINE; } - my ($id, $ty)= map { $j->{$_} } qw(id type); - my $id_num; + my ($id, $ty)= map { $j->{$_} } qw(id type); + my $id_num; - if ($id =~ m#^P(\d+)$#) - { - $id_num= undef; - } - elsif ($id =~ m#^Q(\d+)$#) - { - $id_num= $1; - $max_id= $id_num if ($id_num > $max_id); - } - else - { - print "WARNING: id=[$id]: format incorrect\n"; - next LINE; - } - - $types{$ty}++; + if ($id =~ m#^P(\d+)$#) + { + $id_num= undef; + } + elsif ($id =~ m#^Q(\d+)$#) + { + $id_num= $1; + $max_id= $id_num if ($id_num > $max_id); + } + else + { + print "WARNING: id=[$id]: format incorrect\n"; + next LINE; + } - if ($ty eq 'property') - { - # $pos= tell(FI); - push (@{$props{$id}}, $j); - next LINE; - } + $types{$ty}++; - if ($ty ne 'item' || !defined ($id_num)) - { - print "[$line] [$pos] unknown type=[$ty]\n"; - print DIAG "[$line] [$pos] type=[$ty] line=[$line]\n"; - # $pos= tell(FI); - next LINE; - } + if ($ty eq 'property') + { + # $pos= tell(FI); + push (@{$props{$id}}, $j); + next LINE; + } - # my $py= substr($l, 0, 30) . '...' . substr ($l, -30); - my $px= $fo_rec->print($l); - my $fo_pos_end= $fo_rec->tell(); - # print "px=[$px] l=[$py]\n"; + if ($ty ne 'item' || !defined ($id_num)) + { + print "[$line] [$pos] unknown type=[$ty]\n"; + print DIAG "[$line] [$pos] type=[$ty] line=[$line]\n"; + # $pos= tell(FI); + next LINE; + } - foreach my $a (keys %$j) { $attrs{$a}++; } + # my $py= substr($l, 0, 30) . '...' . substr ($l, -30); + my $px= $fo_rec->print($l); + my $fo_pos_end= $fo_rec->tell(); + # print "px=[$px] l=[$py]\n"; - # grip and counts labels and descriptions - my ($jl, $jd, $ja, $jc, $js)= map { $j->{$_} } @item_attrs; + foreach my $a (keys %$j) { $attrs{$a}++; } - my $c_jl= counter ($jl, \%lang_labels); - my $c_jd= counter ($jd, \%lang_descr); - my $c_ja= counter ($ja, \%lang_aliases); - my $c_jc= counter ($jc, \%prop_claims); - my $c_js= counter ($js, \%name_sitelinks); + # grip and counts labels and descriptions + my ($jl, $jd, $ja, $jc, $js)= map { $j->{$_} } @item_attrs; - # language translations - my (%tlt_l, %tlt_d); - my ($pref_l, $lang_l); - foreach my $lang (@langs) - { - my $label= $jl->{$lang}->{'value'}; - my $desc= $jd->{$lang}->{'value'}; - $tlt_l{$lang}= $label; - $tlt_d{$lang}= $label; + my $c_jl= counter ($jl, \%lang_labels); + my $c_jd= counter ($jd, \%lang_descr); + my $c_ja= counter ($ja, \%lang_aliases); + my $c_jc= counter ($jc, \%prop_claims); + my $c_js= counter ($js, \%name_sitelinks); - unless (defined ($pref_l)) + # language translations + my (%tlt_l, %tlt_d); + my ($pref_l, $lang_l); + foreach my $lang (@langs) { - $pref_l= $label; - $lang_l= $lang; + my $label= $jl->{$lang}->{'value'}; + my $desc= $jd->{$lang}->{'value'}; + $tlt_l{$lang}= $label; + $tlt_d{$lang}= $label; + + unless (defined ($pref_l)) + { + $pref_l= $label; + $lang_l= $lang; + } } - } - # print "tlt_l: ", Dumper (\%tlt_l); - # print "tlt_d: ", Dumper (\%tlt_d); + # print "tlt_l: ", Dumper (\%tlt_l); + # print "tlt_d: ", Dumper (\%tlt_d); - # claims -> properties - my @all_properties= sort keys %$jc; + # claims -> properties + my @all_properties= sort keys %$jc; - # properties filtered - my @found_properties= (); - my @bm_row=(); for (my $i= 0; $i <= $max_prop; $i++) { $bm_row[$i]='.' } + # properties filtered + my @found_properties= (); + my @bm_row=(); for (my $i= 0; $i <= $max_prop; $i++) { $bm_row[$i]='.' } - # Authority Control - my $authctrl; - if ($ty eq 'item') - { - my $use_authctrl= 0; - foreach my $x (@authctrl) + # Authority Control + my $authctrl; + if ($ty eq 'item') { - if (exists ($jc->{$x})) + my $use_authctrl= 0; + foreach my $x (@authctrl) { - $use_authctrl= 1; - last; + if (exists ($jc->{$x})) + { + $use_authctrl= 1; + last; + } } - } - if (!$use_authctrl && exists ($jc->{P31})) - { - my $P31= $jc->{P31}; - my $P31val= $P31->[0]->{mainsnak}->{datavalue}->{value}->{id}; - # print __LINE__, " P31=[$P31] => [$P31val]\n"; - $use_authctrl= 1 if ($P31val eq 'Q5'); - } + if (!$use_authctrl && exists ($jc->{P31})) + { + my $P31= $jc->{P31}; + my $P31val= $P31->[0]->{mainsnak}->{datavalue}->{value}->{id}; + # print __LINE__, " P31=[$P31] => [$P31val]\n"; + $use_authctrl= 1 if ($P31val eq 'Q5'); + } - if ($use_authctrl) - { - $authctrl= - { - 'id' => $id, - 'tlt_l' => \%tlt_l, - 'tlt_d' => \%tlt_d, - }; + if ($use_authctrl) + { + $authctrl= + { + 'id' => $id, + 'tlt_l' => \%tlt_l, + 'tlt_d' => \%tlt_d, + # P31 => $P31, + }; + } } - } - # foreach my $property (@filters) - PROP: foreach my $property (@all_properties) - { - my $prop_num; - if ($property =~ m#^P(\d+)$#) + # foreach my $property (@filters) + PROP: foreach my $property (@all_properties) { - $prop_num= $1; - $max_prop= $prop_num if ($prop_num > $max_prop); - } - else - { - print "WARNING: property=[$property]: format incorrect\n"; - next PROP; - } - $id_prop[$id_num]->[$prop_num]++ if ($exp_bitmap == 1); - $bm_row[$prop_num]='#' if ($exp_bitmap == 2); - - my $p= $jc->{$property}; - # print "p: ", Dumper ($p); - - my $x; - eval { $x= $p->[0]->{'mainsnak'}->{'datavalue'}->{'value'} }; - - # print "x: ", Dumper ($x); # exit; - - if ($@) + my $prop_num; + if ($property =~ m#^P(\d+)$#) { - print DIAG "id=$id error: property=[$property] $x=[$x] e=[$@] property=", Dumper ($p); - next PROP; + $prop_num= $1; + $max_prop= $prop_num if ($prop_num > $max_prop); } - elsif (!defined ($x)) + else { - print DIAG "id=$id undef x: property=[$property] property=", Dumper ($p); + print "WARNING: property=[$property]: format incorrect\n"; next PROP; } + $id_prop[$id_num]->[$prop_num]++ if ($exp_bitmap == 1); + $bm_row[$prop_num]='#' if ($exp_bitmap == 2); - my $y; - if (exists ($filters{$property})) - { - my $fp= $filters{$property}; - # print "fp: ", Dumper ($fp); - - # ZZZ - push (@found_properties, $property); - - $y= $fp->extract($x); - - local *FO_p= $fp->{'_FO'}; - print FO_p join ($TSV_SEP, - $line, $pos, $fo_count, $fo_pos, $fo_pos_end, - $id, $ty, - $c_jl, $c_jd, $c_ja, $c_jc, $c_js, # counters - $lang_l, $pref_l, - $y, - ), "\n"; - } - else - { - $y= WikiData::Property::Filter::_extract ($x, (ref($x) eq 'HASH') ? 1 : 0); - } + my $p= $jc->{$property}; + # print "p: ", Dumper ($p); - if (defined ($authctrl)) - { # collect all filtered properties for the authority record - $authctrl->{$property}= $y; - } - } + my $x; + eval { $x= $p->[0]->{'mainsnak'}->{'datavalue'}->{'value'} }; -# TODO: count claims, aliases, sitelinks, etc. + # print "x: ", Dumper ($x); # exit; + + if ($@) + { + print DIAG "id=$id error: property=[$property] $x=[$x] e=[$@] property=", Dumper ($p); + next PROP; + } + elsif (!defined ($x)) + { + print DIAG "id=$id undef x: property=[$property] property=", Dumper ($p); + next PROP; + } - # print "[$line] [$pos] ", Dumper ($j) if ($ty eq 'property'); - print FO_ITEMS join ($TSV_SEP, + my $y; + if (exists ($filters{$property})) + { + my $fp= $filters{$property}; + # print "fp: ", Dumper ($fp); + + # ZZZ + push (@found_properties, $property); + + $y= $fp->extract($x); + + local *FO_p= $fp->{'_FO'}; + print FO_p join ($TSV_SEP, $line, $pos, $fo_count, $fo_pos, $fo_pos_end, $id, $ty, $c_jl, $c_jd, $c_ja, $c_jc, $c_js, # counters $lang_l, $pref_l, - join (',', @found_properties), - join (',', @all_properties), - ), - "\n"; + $y, + ), "\n"; + } + else + { + $y= WikiData::Property::Filter::_extract ($x, (ref($x) eq 'HASH') ? 1 : 0); + } - printf BM_FILE ("%09d\t", $id_num); - print BM_FILE join ('', @bm_row); - print BM_FILE "\n"; + if (defined ($authctrl)) + { # collect all filtered properties for the authority record + $authctrl->{$property}= $y; + } + } - if (defined ($authctrl)) - { - print FO_AUTHCTRL ",\n" if ($cnt_authctrl); - print FO_AUTHCTRL encode_json($authctrl); - $cnt_authctrl++; - printf ("%8ld authority control records\n", $cnt_authctrl) if (($cnt_authctrl % 1000) == 0); - } + # TODO: count claims, aliases, sitelinks, etc. - last if (defined ($MAX_INPUT_LINES) && $line >= $MAX_INPUT_LINES); ### DEBUG - # $pos= tell(FI); -} + # print "[$line] [$pos] ", Dumper ($j) if ($ty eq 'property'); + print FO_ITEMS join ($TSV_SEP, + $line, $pos, $fo_count, $fo_pos, $fo_pos_end, + $id, $ty, + $c_jl, $c_jd, $c_ja, $c_jc, $c_js, # counters + $lang_l, $pref_l, + join (',', @found_properties), + join (',', @all_properties), + ), + "\n"; + + printf BM_FILE ("%09d\t", $id_num); + print BM_FILE join ('', @bm_row); + print BM_FILE "\n"; -close (FI); -$fo_rec->close(); + if (defined ($authctrl)) + { + print FO_AUTHCTRL ",\n" if ($cnt_authctrl); + print FO_AUTHCTRL encode_json($authctrl); + $cnt_authctrl++; + printf ("%9ld authority control records\n", $cnt_authctrl) if (($cnt_authctrl % 1000) == 0); + } -print "$cnt_authctrl authority records written to $fnm_authctrl\n"; -print FO_AUTHCTRL "\n]\n"; -close (FO_AUTHCTRL); + last if (defined ($MAX_INPUT_LINES) && $line >= $MAX_INPUT_LINES); ### DEBUG + # $pos= tell(FI); + } -# check if there are multiple definitions of the same property and flatten the structure a bit -open (PROPS_LIST, '>:utf8', $data_dir . '/props.csv') or die; -print PROPS_LIST join ($TSV_SEP, qw(prop def_cnt use_cnt datatype label_en descr_en)), "\n"; + close (FI); + $fo_rec->close(); -my @prop_ids= sort { $a <=> $b } map { ($_ =~ m#^P(\d+)$#) ? $1 : undef } keys %props; + print "$cnt_authctrl authority records written to $fnm_authctrl\n"; + print FO_AUTHCTRL "\n]\n"; + close (FO_AUTHCTRL); -foreach my $prop_num (@prop_ids) -# foreach my $prop_num (sort keys %props) -{ - my $prop_id= 'P'.$prop_num; - my @prop= @{$props{$prop_id}}; - my $p0= $prop[0]; - if (@prop != 1) # each property needs to be defined exactly once + # check if there are multiple definitions of the same property and flatten the structure a bit + open (PROPS_LIST, '>:utf8', $data_dir . '/props.csv') or die; + print PROPS_LIST join ($TSV_SEP, qw(prop def_cnt use_cnt datatype label_en descr_en)), "\n"; + + my @prop_ids= sort { $a <=> $b } map { ($_ =~ m#^P(\d+)$#) ? $1 : undef } keys %props; + + foreach my $prop_num (@prop_ids) + # foreach my $prop_num (sort keys %props) { - print "ATTN: prop=[$prop_num] count=",(scalar @prop), "\n"; + my $prop_id= 'P'.$prop_num; + my @prop= @{$props{$prop_id}}; + my $p0= $prop[0]; + if (@prop != 1) # each property needs to be defined exactly once + { + print "ATTN: prop=[$prop_num] count=",(scalar @prop), "\n"; + } + else + { + $props{$prop_num}= $p0; + } + + my $dt= $p0->{'datatype'}; + my $l_en= $p0->{'labels'}->{'en'}->{'value'}; + my $d_en= $p0->{'descriptions'}->{'en'}->{'value'}; + print PROPS_LIST join ($TSV_SEP, $prop_id, (scalar @prop), $prop_claims{$prop_id}, $dt, $l_en, $d_en), "\n"; } - else + close (PROPS_LIST); + + if (open (PROPS, '>:utf8', $data_dir . '/props.json')) { - $props{$prop_num}= $p0; + print PROPS encode_json (\%props); + close (PROPS); } - my $dt= $p0->{'datatype'}; - my $l_en= $p0->{'labels'}->{'en'}->{'value'}; - my $d_en= $p0->{'descriptions'}->{'en'}->{'value'}; - print PROPS_LIST join ($TSV_SEP, $prop_id, (scalar @prop), $prop_claims{$prop_id}, $dt, $l_en, $d_en), "\n"; -} -close (PROPS_LIST); - -open (PROPS, '>:utf8', $data_dir . '/props.json') or die; -print PROPS encode_json (\%props); -close (PROPS); - -print "pos: $pos\n"; -print "types: ", Dumper (\%types); -print "attrs: ", Dumper (\%attrs); -print "lang_labels: ", Dumper (\%lang_labels); -print "lang_descr: ", Dumper (\%lang_descr); -print "lang_aliases: ", Dumper (\%lang_aliases); -print "name_sitelinks: ", Dumper (\%name_sitelinks); -print "prop_claims: ", Dumper (\%prop_claims); - -print "max_id: $max_id\n"; -print "max_prop: $max_prop\n"; -print "lines: $line\n"; -print "fo_count: $fo_count\n"; -print "cnt_authctrl: $cnt_authctrl\n"; + my $stats_opened= 0; + if (open (STATS, '>:utf8', $data_dir . '/conversion-stats.log')) + { + $stats_opened= 1; + print STATS "pos: $pos\n"; + print STATS "types: ", Dumper (\%types); + print STATS "attrs: ", Dumper (\%attrs); + print STATS "lang_labels: ", Dumper (\%lang_labels); + print STATS "lang_descr: ", Dumper (\%lang_descr); + print STATS "lang_aliases: ", Dumper (\%lang_aliases); + print STATS "name_sitelinks: ", Dumper (\%name_sitelinks); + print STATS "prop_claims: ", Dumper (\%prop_claims); + + print STATS "max_id: $max_id\n"; + print STATS "max_prop: $max_prop\n"; + print STATS "lines: $line\n"; + print STATS "fo_count: $fo_count\n"; + print STATS "cnt_authctrl: $cnt_authctrl\n"; + } if ($exp_bitmap == 1) { - # ID to property mapping bitmap - for (my $id= 1; $id <= $max_id; $id++) - { - my $row= $id_prop[$id]; - printf BM_FILE ("%09d\t", $id); - for (my $prop= 0; $prop <= $max_prop; $prop++) - # foreach my $prop (@prop_ids) - { - my $val= $row->[$prop]; - if ($val < 0 || $val > 1) - { - print "warning: invalid count id=[$id] prop=[$prop]\n"; - } - print BM_FILE ($val) ? '#' : '.'; - } - print BM_FILE "\n"; - } - # print "prop_ids: ", join (' ', @prop_ids), "\n"; + # ID to property mapping bitmap + for (my $id= 1; $id <= $max_id; $id++) + { + my $row= $id_prop[$id]; + printf BM_FILE ("%09d\t", $id); + for (my $prop= 0; $prop <= $max_prop; $prop++) + # foreach my $prop (@prop_ids) + { + my $val= $row->[$prop]; + if ($val < 0 || $val > 1) + { + print "warning: invalid count id=[$id] prop=[$prop]\n"; + } + print BM_FILE ($val) ? '#' : '.'; + } + print BM_FILE "\n"; + } + # print "prop_ids: ", join (' ', @prop_ids), "\n"; } close (BM_FILE) if ($exp_bitmap); @@ -651,6 +659,14 @@ print "cnt_authctrl: $cnt_authctrl\n"; print "started: ", scalar localtime ($t_start), "\n"; print "finished: ", scalar localtime ($t_end), "\n"; print "duration: ", $t_end-$t_start, " seconds\n"; + + if ($stats_opened) + { + print STATS "started: ", scalar localtime ($t_start), "\n"; + print STATS "finished: ", scalar localtime ($t_end), "\n"; + print STATS "duration: ", $t_end-$t_start, " seconds\n"; + close (STATS); + } } sub counter