Skip to content
Snippets Groups Projects
Commit 605effa5 authored by Gerhard Gonter
Browse files

debugging and messaging changed; added properties to extract

parent 42fff9f9
No related branches found
No related tags found
No related merge requests found
Pipeline #2 failed
......@@ -70,7 +70,7 @@ notify('starting wdq0 loop');
while (1)
{
my $dumps= check_dump();
# print "dumps: ", Dumper ($dumps);
print "dumps: ", Dumper ($dumps);
foreach my $dump (@$dumps)
{
fetch_and_convert ($dump->{date}, $seq, $dump->{size});
......@@ -87,7 +87,9 @@ sub notify
{
my $msg= shift;
system (qw(notify-sms.pl gg-uni), $msg);
print "NOTIFY: [$msg]\n";
system (qw(notify-sms.pl gg-uni), scalar localtime(time()), $msg);
sleep(1);
}
sub fetch_and_convert
......@@ -105,7 +107,7 @@ sub fetch_and_convert
}
else
{
print "fetching stuff for $date\n";
print "fetching stuff for date=$date seq=$seq data_dir=[$data_dir]\n";
notify("wdq0: about to fetch dump for $date");
my ($fetched, $dump_file)= fetch_dump ($date);
......@@ -195,13 +197,15 @@ sub check_dump
print "cmd_fetch=[$cmd_fetch]\n";
open (LST, '-|', $cmd_fetch) or die "can't run $cmd_fetch";
my @res;
while (<LST>)
LST: while (<LST>)
{
chop;
if (m#<a href="((\d{4})(\d{2})(\d{2})\.json\.gz)">(\d{8}\.json\.gz)</a>\s+(\S+)\s+(\S+)\s+(\d+)#)
{
my ($f1, $year, $mon, $day, $f2, $xdate, $time, $size)= ($1, $2, $3, $4, $5, $6, $7, $8);
print "year=[$year] mon=[$mon] day=[$day] f1=[$f1] f2=[$f2] xdate=[$xdate] time=[$time] size=[$size]\n";
next LST if ($size <= 63);
next LST if ($size <= 30_000_000_000);
my $rec=
{
dump_file => $f1,
......
......@@ -103,11 +103,10 @@ sub analyze_wikidata_dump
{
my $fnm= shift;
open (DIAG, '>:utf8', '@diag') or die;
# statistics
my %types;
my %attrs;
my %count_snaktype;
# item statistics
my %lang_labels;
......@@ -118,6 +117,20 @@ my %name_sitelinks;
my %props;
unless (-d $data_dir)
{
print "mkdir $data_dir\n";
mkdir ($data_dir);
}
unless (-d $out_dir)
{
print "mkdir $out_dir\n";
mkdir ($out_dir)
}
my $diag_file= $data_dir.'/@diag';
open (DIAG, '>:utf8', $diag_file) or die "can't write diag file=[$diag_file]";
my @item_attrs= qw(labels descriptions aliases claims sitelinks);
my $running= 1;
......@@ -137,17 +150,6 @@ my %props;
my $line= 0;
my $t_start= time();
unless (-d $data_dir)
{
print "mkdir $data_dir\n";
mkdir ($data_dir);
}
unless (-d $out_dir)
{
print "mkdir $out_dir\n";
mkdir ($out_dir)
}
# item list
my $fnm_items= $data_dir . '/items.csv';
......@@ -155,7 +157,7 @@ local *FO_ITEMS;
open (FO_ITEMS, '>:utf8', $fnm_items) or die "can't write to [$fnm_items]";
my @cols1= qw(line pos fo_count fo_pos_beg fo_pos_end id type cnt_label cnt_desc cnt_aliases cnt_claims cnt_sitelink lang label);
print FO_ITEMS join ($TSV_SEP, @cols1, qw(filtered_props claims)), "\n";
autoflush FO_ITEMS 1;
# autoflush FO_ITEMS 1;
# properties
my @cols_filt= (@cols1, 'val');
......@@ -190,7 +192,9 @@ my %filters=
# person identifiers
'P214' => wdpf ('P214', 'VIAF identifier'),
'P496' => wdpf ('P496', 'ORCID identifier'),
'P651' => wdpf ('P651', 'Biografisch Portaal number'), # identifier at Biografisch Portaal van Nederland
'P2280' => wdpf ('P2280', 'Austrian Parliament ID'), # identifier for an individual, in the Austrian Parliament's "Who's Who" database
'P3421' => wdpf ('P3421', 'Belvedere artist ID'), # identifier assigned to an artist by the Österreichische Galerie Belvedere in Vienna
# personal data?
'P569' => wdpf ('P569', 'Date of birth'),
......@@ -229,10 +233,31 @@ my %filters=
'P434' => wdpf ('P434', 'MusicBrainz artist id'),
'P435' => wdpf ('P435', 'MusicBrainz work id'),
'P436' => wdpf ('P436', 'MusicBrainz release group id'),
'P966' => wdpf ('P966', 'MusicBrainz label ID'),
'P982' => wdpf ('P982', 'MusicBrainz area ID'),
'P1004' => wdpf ('P1004', 'MusicBrainz place id'),
# BookBrainz
'P1407' => wdpf ('P1407', 'MusicBrainz series id'),
'P4404' => wdpf ('P4404', 'MusicBrainz recording id'),
'P5813' => wdpf ('P5813', 'MusicBrainz release id'),
# AllMusic
# (the first argument to wdpf repeats the entry's own hash key)
'P1728' => wdpf ('P1728', 'AllMusic artist ID'),
'P1729' => wdpf ('P1729', 'AllMusic album ID'),
'P1730' => wdpf ('P1730', 'AllMusic song ID'),
'P1994' => wdpf ('P1994', 'AllMusic composition ID'),
'P6110' => wdpf ('P6110', 'AllMusic release ID'),
'P6306' => wdpf ('P6306', 'AllMusic performance ID'),
# Google Play Music
'P4198' => wdpf ('P4198', 'Google Play Music artist ID'),
'P4199' => wdpf ('P4199', 'Google Play Music album ID'),
# Amazon Music database
'P6276' => wdpf ('P6276', 'Amazon Music artist ID'),
# Books
'P2607' => wdpf ('P2607', 'BookBrainz creator ID'), # identifier for a creator per the BookBrainz open book encyclopedia
'P123' => wdpf ('P123', 'publisher'), # organization or person responsible for publishing books, periodicals, games or software
# WorldCat
# (the first argument to wdpf repeats the entry's own hash key)
'P2163' => wdpf ('P2163', 'FAST-ID'), # authority control identifier in WorldCat's “FAST Linked Data” authority file
......@@ -282,6 +307,10 @@ my %filters=
'P1581' => wdpf ('P1581' => 'official blog'),
'P2699' => wdpf ('P2699' => 'URL'),
# other person identifiers
'P5246' => wdpf ('P5246' => 'Pornhub ID'),
'P5267' => wdpf ('P5267' => 'YouPorn ID'),
'P5540' => wdpf ('P5540' => 'RedTube ID'),
# '' => wdpf ('' => ''),
);
my @filters= sort keys %filters;
......@@ -498,19 +527,35 @@ my $fo_count= $fo_rec->open();
my $p= $jc->{$property};
# print "p: ", Dumper ($p);
my $ms;
eval { $ms= $p->[0]->{mainsnak} };
if ($@)
{
print DIAG "id=$id ERROR: no mainsnak element; property=[$property] e=[$@] property=", Dumper ($p);
next PROP;
}
my $snaktype= $ms->{snaktype};
$count_snaktype{$snaktype}++;
if ($snaktype ne 'value')
{
print DIAG "id=$id NOTE: snaktype=[$snaktype], property=[$property]\n";
next PROP;
}
my $x;
eval { $x= $p->[0]->{'mainsnak'}->{'datavalue'}->{'value'} };
eval { $x= $ms->{datavalue}->{value} };
# print "x: ", Dumper ($x); # exit;
if ($@)
{
# the eval just above died while reading the value: log the details to DIAG
# and continue with the next iteration of the PROP loop
print DIAG "id=$id error: property=[$property] x=[$x] e=[$@] property=", Dumper ($p);
print DIAG "id=$id ERROR: no value element; property=[$property] x=[$x] e=[$@] property=", Dumper ($p);
next PROP;
}
elsif (!defined ($x))
{
print DIAG "id=$id undef x: property=[$property] property=", Dumper ($p);
print DIAG "id=$id NOTE: undef property value: property=[$property] property=", Dumper ($p);
next PROP;
}
......@@ -633,6 +678,7 @@ my $fo_count= $fo_rec->open();
print STATS "lines: $line\n";
print STATS "fo_count: $fo_count\n";
print STATS "cnt_authctrl: $cnt_authctrl\n";
print STATS "snaktypes: ", Dumper (\%count_snaktype);
}
if ($exp_bitmap == 1)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment