From e3c65b0ad2138f8c5e5f4ac07d5fe1472980cb41 Mon Sep 17 00:00:00 2001 From: Gerhard Gonter <ggonter@gmail.com> Date: Tue, 13 Dec 2016 09:56:24 +0100 Subject: [PATCH] refactor for authority control extraction --- lib/WikiData/Property/Filter.pm | 9 ++- wdq1.pl | 99 +++++++++++++++++++++++++++------ wdq2.pl | 2 +- 3 files changed, 91 insertions(+), 19 deletions(-) diff --git a/lib/WikiData/Property/Filter.pm b/lib/WikiData/Property/Filter.pm index 276c719..e2c9272 100644 --- a/lib/WikiData/Property/Filter.pm +++ b/lib/WikiData/Property/Filter.pm @@ -74,8 +74,15 @@ sub extract my $x= shift; my $y; + _extract ($x, $fp->{'transform'}); +} + +sub _extract +{ + my $x= shift; + my $transform= shift; - if ($fp->{'transform'} == 1 && ref ($x) eq 'HASH') + if ($transform == 1 && ref ($x) eq 'HASH') { my $et; if ($x->{'entity-type'} eq 'item') { $et= 'Q'; } diff --git a/wdq1.pl b/wdq1.pl index 66324e7..c5d1454 100755 --- a/wdq1.pl +++ b/wdq1.pl @@ -24,7 +24,7 @@ my $exp_bitmap= 0; # 1..does not work; 2..makes no sense, too sparsely populated # not used my $LR_max_propid= 1930; # dump from 20150608 my $seq= 'a'; -my $date= '2016-08-22'; # maybe a config file should be used to set up the defaults... +my $date= '2016-12-12'; # maybe a config file should be used to set up the defaults... my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq); my $upd_paths= 0; @@ -116,6 +116,9 @@ my %props; my @item_attrs= qw(labels descriptions aliases claims sitelinks); +my $running= 1; +$SIG{INT}= sub { $running= 0; }; + # local *FI= wkutils::open_input($fnm); if ($fnm =~ /\.gz$/) { @@ -223,6 +226,12 @@ my %filters= 'P436' => wdpf ('P436', 'MusicBrainz release group id'), 'P1004' => wdpf ('P1004', 'MusicBrainz place id'), + # BookBrainz + 'P2607' => wdpf ('P2607', 'BookBrainz creator ID'), # identifier for a creator per the BookBrainz open book encyclopedia + + # WorldCat + 'P2163' => wdpf ('P163', 'FAST-ID'), # authority control identifier in WorldCat's “FAST Linked Data” authority file + # Geography 'P625' => wdpf ('P625', 'Geo Coordinates'), '1566' => wdpf ('P1566', 'GeoNames ID'), @@ -269,6 +278,20 @@ my %filters= ); my @filters= sort keys %filters; +# Authority Control +my @authctrl= qw(P213 P214 P227 P244 P496); +my %authctrl= map { $_ => 1 } @authctrl; + +my $fnm_authctrl= $data_dir . '/authctrl.json'; + +local *FO_AUTHCTRL; +open (FO_AUTHCTRL, '>:utf8', $fnm_authctrl) or die "can't write to [$fnm_authctrl]"; +autoflush FO_AUTHCTRL 1; +print FO_AUTHCTRL "[\n"; +my $cnt_authctrl= 0; + +# properties + # Property Bitmap Table my @id_prop= (); # bitmap table my $max_id= -1; @@ -287,7 +310,7 @@ my $fo_pos= 0; <FI>; my $pos; -LINE: while (1) +LINE: while ($running) { $pos= tell(FI); my $l= <FI>; @@ -401,6 +424,25 @@ LINE: while (1) my @found_properties= (); my @bm_row=(); for (my $i= 0; $i <= $max_prop; $i++) { $bm_row[$i]='.' } + # Authority Control + my $authctrl; + if ($ty eq 'item') + { + foreach my $x (@authctrl) + { + if (exists ($jc->{$x})) + { + $authctrl= + { + 'id' => $id, + 'tlt_l' => \%tlt_l, + 'tlt_d' => \%tlt_d, + }; + last; + } + } + } + # foreach my $property (@filters) PROP: foreach my $property (@all_properties) { @@ -418,11 +460,6 @@ LINE: while (1) $id_prop[$id_num]->[$prop_num]++ if ($exp_bitmap == 1); $bm_row[$prop_num]='#' if ($exp_bitmap == 2); - # if (exists ($jc->{$property})) - if (exists ($filters{$property})) - { - my $fp= $filters{$property}; - # print "fp: ", Dumper ($fp); my $p= $jc->{$property}; # print "p: ", Dumper ($p); @@ -434,27 +471,42 @@ LINE: while (1) if ($@) { print DIAG "id=$id error: property=[$property] $x=[$x] e=[$@] property=", Dumper ($p); + next PROP; } elsif (!defined ($x)) { print DIAG "id=$id undef x: property=[$property] property=", Dumper ($p); + next PROP; } - else - { + + my $y; + if (exists ($filters{$property})) + { + my $fp= $filters{$property}; + # print "fp: ", Dumper ($fp); + # ZZZ push (@found_properties, $property); - my $y= $fp->extract($x); + $y= $fp->extract($x); local *FO_p= $fp->{'_FO'}; print FO_p join ($TSV_SEP, - $line, $pos, $fo_count, $fo_pos, $fo_pos_end, - $id, $ty, - $c_jl, $c_jd, $c_ja, $c_jc, $c_js, # counters - $lang_l, $pref_l, - $y, - ), "\n"; - } + $line, $pos, $fo_count, $fo_pos, $fo_pos_end, + $id, $ty, + $c_jl, $c_jd, $c_ja, $c_jc, $c_js, # counters + $lang_l, $pref_l, + $y, + ), "\n"; + } + else + { + $y= WikiData::Property::Filter::_extract ($x, (ref($x) eq 'HASH') ? 1 : 0); + } + + if (defined ($authctrl)) + { # collect all filtered properties for the authority record + $authctrl->{$property}= $y; } } @@ -475,6 +527,14 @@ LINE: while (1) print BM_FILE join ('', @bm_row); print BM_FILE "\n"; + if (defined ($authctrl)) + { + print FO_AUTHCTRL ",\n" if ($cnt_authctrl); + print FO_AUTHCTRL to_json($authctrl); + $cnt_authctrl++; + print "$cnt_authctrl authority control records\n" if (($cnt_authctrl % 1000) == 0); + } + last if (defined ($MAX_INPUT_LINES) && $line >= $MAX_INPUT_LINES); ### DEBUG # $pos= tell(FI); } @@ -482,6 +542,10 @@ LINE: while (1) close (FI); $fo_rec->close(); +print "$cnt_authctrl authority records written to $fnm_authctrl\n"; +print FO_AUTHCTRL "\n]\n"; +close (FO_AUTHCTRL); + # check if there are multiple definitions of the same property and flatten the structure a bit open (PROPS_LIST, '>:utf8', $data_dir . '/props.csv') or die; print PROPS_LIST join ($TSV_SEP, qw(prop def_cnt use_cnt datatype label_en descr_en)), "\n"; @@ -527,6 +591,7 @@ print "max_id: $max_id\n"; print "max_prop: $max_prop\n"; print "lines: $line\n"; print "fo_count: $fo_count\n"; +print "cnt_authctrl: $cnt_authctrl\n"; if ($exp_bitmap == 1) { diff --git a/wdq2.pl b/wdq2.pl index 7e99f12..f042161 100755 --- a/wdq2.pl +++ b/wdq2.pl @@ -20,7 +20,7 @@ use Wiktionary::Utils; use PDS; my $seq= 'a'; -my $date= '2016-08-22'; +my $date= '2016-12-05'; my $lang= undef; my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq); my $cmp_fnm_pattern= '%s/wdq%05d.cmp'; -- GitLab