Skip to content
Snippets Groups Projects
Commit e3c65b0a authored by Gerhard Gonter
Browse files

refactor for authority control extraction

parent 02fe41dd
No related branches found
No related tags found
No related merge requests found
...@@ -74,8 +74,15 @@ sub extract ...@@ -74,8 +74,15 @@ sub extract
my $x= shift; my $x= shift;
my $y; my $y;
_extract ($x, $fp->{'transform'});
}
sub _extract
{
my $x= shift;
my $transform= shift;
if ($fp->{'transform'} == 1 && ref ($x) eq 'HASH') if ($transform == 1 && ref ($x) eq 'HASH')
{ {
my $et; my $et;
if ($x->{'entity-type'} eq 'item') { $et= 'Q'; } if ($x->{'entity-type'} eq 'item') { $et= 'Q'; }
......
...@@ -24,7 +24,7 @@ my $exp_bitmap= 0; # 1..does not work; 2..makes no sense, too sparsely populated ...@@ -24,7 +24,7 @@ my $exp_bitmap= 0; # 1..does not work; 2..makes no sense, too sparsely populated
# not used my $LR_max_propid= 1930; # dump from 20150608 # not used my $LR_max_propid= 1930; # dump from 20150608
my $seq= 'a'; my $seq= 'a';
my $date= '2016-08-22'; # maybe a config file should be used to set up the defaults... my $date= '2016-12-12'; # maybe a config file should be used to set up the defaults...
my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq); my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq);
my $upd_paths= 0; my $upd_paths= 0;
...@@ -116,6 +116,9 @@ my %props; ...@@ -116,6 +116,9 @@ my %props;
my @item_attrs= qw(labels descriptions aliases claims sitelinks); my @item_attrs= qw(labels descriptions aliases claims sitelinks);
my $running= 1;
$SIG{INT}= sub { $running= 0; };
# local *FI= wkutils::open_input($fnm); # local *FI= wkutils::open_input($fnm);
if ($fnm =~ /\.gz$/) if ($fnm =~ /\.gz$/)
{ {
...@@ -223,6 +226,12 @@ my %filters= ...@@ -223,6 +226,12 @@ my %filters=
'P436' => wdpf ('P436', 'MusicBrainz release group id'), 'P436' => wdpf ('P436', 'MusicBrainz release group id'),
'P1004' => wdpf ('P1004', 'MusicBrainz place id'), 'P1004' => wdpf ('P1004', 'MusicBrainz place id'),
# BookBrainz
'P2607' => wdpf ('P2607', 'BookBrainz creator ID'), # identifier for a creator per the BookBrainz open book encyclopedia
# WorldCat
'P2163' => wdpf ('P163', 'FAST-ID'), # authority control identifier in WorldCat's “FAST Linked Data” authority file
# Geography # Geography
'P625' => wdpf ('P625', 'Geo Coordinates'), 'P625' => wdpf ('P625', 'Geo Coordinates'),
'1566' => wdpf ('P1566', 'GeoNames ID'), '1566' => wdpf ('P1566', 'GeoNames ID'),
...@@ -269,6 +278,20 @@ my %filters= ...@@ -269,6 +278,20 @@ my %filters=
); );
my @filters= sort keys %filters; my @filters= sort keys %filters;
# Authority Control
my @authctrl= qw(P213 P214 P227 P244 P496);
my %authctrl= map { $_ => 1 } @authctrl;
my $fnm_authctrl= $data_dir . '/authctrl.json';
local *FO_AUTHCTRL;
open (FO_AUTHCTRL, '>:utf8', $fnm_authctrl) or die "can't write to [$fnm_authctrl]";
autoflush FO_AUTHCTRL 1;
print FO_AUTHCTRL "[\n";
my $cnt_authctrl= 0;
# properties
# Property Bitmap Table # Property Bitmap Table
my @id_prop= (); # bitmap table my @id_prop= (); # bitmap table
my $max_id= -1; my $max_id= -1;
...@@ -287,7 +310,7 @@ my $fo_pos= 0; ...@@ -287,7 +310,7 @@ my $fo_pos= 0;
<FI>; <FI>;
my $pos; my $pos;
LINE: while (1) LINE: while ($running)
{ {
$pos= tell(FI); $pos= tell(FI);
my $l= <FI>; my $l= <FI>;
...@@ -401,6 +424,25 @@ LINE: while (1) ...@@ -401,6 +424,25 @@ LINE: while (1)
my @found_properties= (); my @found_properties= ();
my @bm_row=(); for (my $i= 0; $i <= $max_prop; $i++) { $bm_row[$i]='.' } my @bm_row=(); for (my $i= 0; $i <= $max_prop; $i++) { $bm_row[$i]='.' }
# Authority Control
my $authctrl;
if ($ty eq 'item')
{
foreach my $x (@authctrl)
{
if (exists ($jc->{$x}))
{
$authctrl=
{
'id' => $id,
'tlt_l' => \%tlt_l,
'tlt_d' => \%tlt_d,
};
last;
}
}
}
# foreach my $property (@filters) # foreach my $property (@filters)
PROP: foreach my $property (@all_properties) PROP: foreach my $property (@all_properties)
{ {
...@@ -418,11 +460,6 @@ LINE: while (1) ...@@ -418,11 +460,6 @@ LINE: while (1)
$id_prop[$id_num]->[$prop_num]++ if ($exp_bitmap == 1); $id_prop[$id_num]->[$prop_num]++ if ($exp_bitmap == 1);
$bm_row[$prop_num]='#' if ($exp_bitmap == 2); $bm_row[$prop_num]='#' if ($exp_bitmap == 2);
# if (exists ($jc->{$property}))
if (exists ($filters{$property}))
{
my $fp= $filters{$property};
# print "fp: ", Dumper ($fp);
my $p= $jc->{$property}; my $p= $jc->{$property};
# print "p: ", Dumper ($p); # print "p: ", Dumper ($p);
...@@ -434,17 +471,24 @@ LINE: while (1) ...@@ -434,17 +471,24 @@ LINE: while (1)
if ($@) if ($@)
{ {
print DIAG "id=$id error: property=[$property] $x=[$x] e=[$@] property=", Dumper ($p); print DIAG "id=$id error: property=[$property] $x=[$x] e=[$@] property=", Dumper ($p);
next PROP;
} }
elsif (!defined ($x)) elsif (!defined ($x))
{ {
print DIAG "id=$id undef x: property=[$property] property=", Dumper ($p); print DIAG "id=$id undef x: property=[$property] property=", Dumper ($p);
next PROP;
} }
else
my $y;
if (exists ($filters{$property}))
{ {
my $fp= $filters{$property};
# print "fp: ", Dumper ($fp);
# ZZZ # ZZZ
push (@found_properties, $property); push (@found_properties, $property);
my $y= $fp->extract($x); $y= $fp->extract($x);
local *FO_p= $fp->{'_FO'}; local *FO_p= $fp->{'_FO'};
print FO_p join ($TSV_SEP, print FO_p join ($TSV_SEP,
...@@ -455,6 +499,14 @@ LINE: while (1) ...@@ -455,6 +499,14 @@ LINE: while (1)
$y, $y,
), "\n"; ), "\n";
} }
else
{
$y= WikiData::Property::Filter::_extract ($x, (ref($x) eq 'HASH') ? 1 : 0);
}
if (defined ($authctrl))
{ # store the extracted value of this property (with or without a filter) in the authority record
$authctrl->{$property}= $y;
} }
} }
...@@ -475,6 +527,14 @@ LINE: while (1) ...@@ -475,6 +527,14 @@ LINE: while (1)
print BM_FILE join ('', @bm_row); print BM_FILE join ('', @bm_row);
print BM_FILE "\n"; print BM_FILE "\n";
if (defined ($authctrl))
{
print FO_AUTHCTRL ",\n" if ($cnt_authctrl);
print FO_AUTHCTRL to_json($authctrl);
$cnt_authctrl++;
print "$cnt_authctrl authority control records\n" if (($cnt_authctrl % 1000) == 0);
}
last if (defined ($MAX_INPUT_LINES) && $line >= $MAX_INPUT_LINES); ### DEBUG last if (defined ($MAX_INPUT_LINES) && $line >= $MAX_INPUT_LINES); ### DEBUG
# $pos= tell(FI); # $pos= tell(FI);
} }
...@@ -482,6 +542,10 @@ LINE: while (1) ...@@ -482,6 +542,10 @@ LINE: while (1)
close (FI); close (FI);
$fo_rec->close(); $fo_rec->close();
print "$cnt_authctrl authority records written to $fnm_authctrl\n";
print FO_AUTHCTRL "\n]\n";
close (FO_AUTHCTRL);
# check if there are multiple definitions of the same property and flatten the structure a bit # check if there are multiple definitions of the same property and flatten the structure a bit
open (PROPS_LIST, '>:utf8', $data_dir . '/props.csv') or die; open (PROPS_LIST, '>:utf8', $data_dir . '/props.csv') or die;
print PROPS_LIST join ($TSV_SEP, qw(prop def_cnt use_cnt datatype label_en descr_en)), "\n"; print PROPS_LIST join ($TSV_SEP, qw(prop def_cnt use_cnt datatype label_en descr_en)), "\n";
...@@ -527,6 +591,7 @@ print "max_id: $max_id\n"; ...@@ -527,6 +591,7 @@ print "max_id: $max_id\n";
print "max_prop: $max_prop\n"; print "max_prop: $max_prop\n";
print "lines: $line\n"; print "lines: $line\n";
print "fo_count: $fo_count\n"; print "fo_count: $fo_count\n";
print "cnt_authctrl: $cnt_authctrl\n";
if ($exp_bitmap == 1) if ($exp_bitmap == 1)
{ {
......
...@@ -20,7 +20,7 @@ use Wiktionary::Utils; ...@@ -20,7 +20,7 @@ use Wiktionary::Utils;
use PDS; use PDS;
my $seq= 'a'; my $seq= 'a';
my $date= '2016-08-22'; my $date= '2016-12-05';
my $lang= undef; my $lang= undef;
my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq); my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq);
my $cmp_fnm_pattern= '%s/wdq%05d.cmp'; my $cmp_fnm_pattern= '%s/wdq%05d.cmp';
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment