diff --git a/.gitignore b/.gitignore index e48530fb578c0f143b599484f3a441ddf051e292..e7d115ba111aadeb1f83e8f5bb658d9985f86f89 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ data/ wkt-??/ tmp/ @* +*.swp *.tys *.items items.csv @@ -17,6 +18,7 @@ P234.csv P496.csv P625.csv P*.csv +P*.tsv Q* PDS_backing.pages latest diff --git a/lib/PDS.pm b/lib/PDS.pm index cbdc31ff6e0af3ef05e115986e479ae37f79d0b9..68241a5af1626c3665e8fcece51798f67339b114 100644 --- a/lib/PDS.pm +++ b/lib/PDS.pm @@ -39,6 +39,8 @@ my %defaults= page_hits => [], # number of times a page was loaded! ); +my $DEBUG= 0; + sub new { my $class= shift; @@ -68,9 +70,26 @@ sub new print "opened paging backing file [$self->{backing_file}] in mode [$bf_mode]\n"; $self->{__FPDS__}= *FPDS; + $self->debug_hdr() if ($DEBUG > 0); + $self; } +sub debug_hdr +{ + my $self= shift; + + print "--- 8< ---\n"; + print "caller: ", join (' ', caller()), "\n"; + printf ("paging: page_size=[0x%08lX] page_hdr_size=[0x%04X] rec_size=[0x%04X] recs_per_page=[0x%08lX] backing_file=[%s]\n", + map { $self->{$_} } qw(page_size page_hdr_size rec_size recs_per_page backing_file)); + printf ("page_info: last_page_num=[%d] highest_page_num=[%d] last_page=[%s]\n", + map { $self->{$_} } qw(last_page_num highest_page_num last_page)); + printf ("counter: page_same=[%d] page_next=[%d] page_up=[%d] page_down=[%d]\n", + map { $self->{$_} } qw(cnt_page_same cnt_page_next cnt_page_up cnt_page_down)); + print "--- >8 ---\n"; +} + sub set { my $self= shift; @@ -103,7 +122,8 @@ sub retrieve # print "pdsp: rec_num=[$rec_num] page_num=[$pdsp->{page_num}] rel_rec_num=[$rel_rec_num] rel_rec_pos=[$rel_rec_pos]\n"; my $d= substr ($pdsp->{buffer}, $rel_rec_pos, $self->{rec_size}); - # print "d:\n"; main::hexdump ($d); + print "d:\n"; main::hexdump ($d); + #print "buffer:\n"; main::hexdump ($pdsp->{buffer}); $d; } @@ -113,13 +133,16 @@ sub get_page_by_rec_num my $self= shift; my $rec_num= shift; - my ($rec_size, $last_page_num, $last_page)= map { 
$self->{$_} } qw(rec_size last_page_num $last_page); +print "get_page_by_rec_num: rec_num=[$rec_num]\n" if ($DEBUG > 2); + my ($rec_size, $recs_per_page, $last_page_num, $last_page)= map { $self->{$_} } qw(rec_size recs_per_page last_page_num last_page); - my $page_num= int ($rec_num * $rec_size / $self->{page_size}); - my $rel_rec_num= $rec_num % $self->{recs_per_page}; + # my $page_num= int ($rec_num * $rec_size / $self->{page_size}); + my $page_num= int ($rec_num / $recs_per_page); + my $rel_rec_num= $rec_num % $recs_per_page; my $rel_rec_pos= $self->{page_hdr_size} + $rel_rec_num * $rec_size; +print "get_page_by_rec_num: page_num=[$page_num] rel_rec_num=[$rel_rec_num] rel_rec_pos=[$rel_rec_pos]\n" if ($DEBUG > 2); # print __LINE__, " rec_num=[$rec_num] page_num=[$page_num]\n"; if ($page_num == $last_page_num) @@ -189,8 +212,9 @@ sub print_page_info { my $self= shift; - print "page_size=[$self->{page_size}]\n"; - print "recs_per_page=[$self->{recs_per_page}]\n"; + printf ("page_size=[0x%08lX]\n", $self->{page_size}); + printf ("rec_size=[0x%08lx]\n", $self->{rec_size}); + printf ("recs_per_page=[0x%08lx]\n", $self->{recs_per_page}); $self->print_page_stats(); print "highest_page_num=[$self->{highest_page_num}]\n"; @@ -203,7 +227,8 @@ sub load_page my $self= shift; my $page_num= shift; - # print "loading page_num=[$page_num]\n"; + # print '='x72, "\nloading page_num=[$page_num]\n"; + # if (0 && $page_num >= 200) { print "EXIT at page 200!\n"; exit; } my $new_page= { @@ -221,8 +246,9 @@ sub load_page local *FPDS= $self->{'__FPDS__'}; my $page_size= $self->{page_size}; + # $self->debug_hdr(); my $rc= seek(FPDS, $page_pos, 0); - # print "seek: rc=[$rc]\n"; + # printf ("%d seek: pos=[0x%08lX] rc=[%d]\n", __LINE__, $page_pos, $rc); my $new_buffer; my $bc= sysread(FPDS, $new_buffer, $page_size); unless ($bc == $page_size) @@ -271,7 +297,7 @@ sub flush_page my ($page, $page_num)= map { $self->{$_} } qw(last_page last_page_num); - # print "flushing 
page_num=[$page_num]\n"; + print '='x72, "\nflushing page_num=[$page_num]\n" if ($DEBUG > 1); return undef unless ($page_num >= 0 && defined ($page)); # print "TODO: writing data page_num=[$page_num]\n"; @@ -284,8 +310,9 @@ sub flush_page my @d= @{$page->{dirty}}; my $b= $page->{buffer}; - # my $cnt_dirty= @d; - # print "flush: page_num=[$page_num] cnt_dirty=[$cnt_dirty]\n"; + my $cnt_dirty= @d; + print "flush: page_num=[$page_num] cnt_dirty=[$cnt_dirty]\n" if ($DEBUG > 1); + # $self->debug_hdr(); my $new_buffer= $self->setup_header($page_num, 0x12345678); # print "new_buffer length=[",length($new_buffer), "]\n"; @@ -325,8 +352,9 @@ sub flush_page } local *FPDS= $self->{'__FPDS__'}; + # $self->debug_hdr(); my $rc= seek(FPDS, $page->{page_pos}, 0); - # print "seek: rc=[$rc]\n"; + # printf ("%d seek: pos=[0x%08lX] rc=[%d]\n", __LINE__, $page->{page_pos}, $rc); my $bc= syswrite(FPDS, $new_buffer, $page_size); unless ($bc == $page_size) { diff --git a/wdq1.pl b/wdq1.pl index a9d0da2dc7a37b1cea064a011f419acd6b0bd7e5..66324e751117d6f9689291189c9aad77f85d02b3 100755 --- a/wdq1.pl +++ b/wdq1.pl @@ -24,7 +24,7 @@ my $exp_bitmap= 0; # 1..does not work; 2..makes no sense, too sparsely populated # not used my $LR_max_propid= 1930; # dump from 20150608 my $seq= 'a'; -my $date= '2016-08-16'; # maybe a config file is in order to set up the defaults... +my $date= '2016-08-22'; # maybe a config file should be used to set up the defaults... my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq); my $upd_paths= 0; @@ -173,12 +173,16 @@ my %filters= 'P361' => wdpf ('P361', 'part of', 1), 'P1269' => wdpf ('P1269', 'facet of', 1), - # person identifiers + # item identifier (persons, places, etc.) 
+ 'P213' => wdpf ('P213', 'ISNI'), # International Standard Name Identifier for an identity 'P227' => wdpf ('P227', 'GND identifier'), + 'P244' => wdpf ('P244', 'LCAuth ID'), # Library of Congress ID for authority control (for books use P1144) + 'P1245' => wdpf ('P1245', 'OmegaWiki Defined Meaning'), # "Defined Meaning" on the site OmegaWiki + + # person identifiers 'P214' => wdpf ('P214', 'VIAF identifier'), 'P496' => wdpf ('P496', 'ORCID identifier'), - - 'P213' => wdpf ('P213', 'ISNI'), # check + 'P2280' => wdpf ('P2280', 'Austrian Parliament ID'), # identifier for an individual, in the Austrian Parliament's "Who's Who" database # personal data? 'P569' => wdpf ('P569', 'Date of birth'), @@ -186,17 +190,19 @@ my %filters= 'P2298' => wdpf ('P2298', 'NSDAP membership number (1925-1945)'), # publications - 'P345' => wdpf ('P345', 'IMDb identifier'), 'P212' => wdpf ('P212', 'ISBN-13'), 'P236' => wdpf ('P212', 'ISSN'), + 'P345' => wdpf ('P345', 'IMDb identifier'), + 'P356' => wdpf ('P356', 'DOI'), + 'P698' => wdpf ('P698', 'PubMed ID'), # identifier for journal articles/abstracts in PubMed 'P957' => wdpf ('P957', 'ISBN-10'), + 'P3035' => wdpf ('P3035', 'ISBN publisher prefix'), # ISBN publisher prefix # arXiv.org 'P818' => wdpf ('P818', 'arXiv ID'), 'P820' => wdpf ('P820', 'arXiv classification'), # permanent identifiers - 'P356' => wdpf ('P356', 'DOI'), 'P1184' => wdpf ('P1184', 'Handle'), 'P727' => wdpf ('P727', 'Europeana ID'), 'P1036' => wdpf ('P1036', 'Dewey Decimal Classification'), @@ -217,8 +223,10 @@ my %filters= 'P436' => wdpf ('P436', 'MusicBrainz release group id'), 'P1004' => wdpf ('P1004', 'MusicBrainz place id'), - # misc. 
- 'P625' => wdpf ('P625', 'Geo Coordinates'), + # Geography + 'P625' => wdpf ('P625', 'Geo Coordinates'), + 'P1566' => wdpf ('P1566', 'GeoNames ID'), + 'P964' => wdpf ('P964', 'Austrian municipality key'), # identifier for municipalities in Austria # chemistry 'P233' => wdpf ('P233', 'SMILES'), # Simplified Molecular Input Line Entry Specification @@ -240,6 +248,24 @@ my %filters= 'P1072' => wdpf ('P1072' => 'readable file format'), 'P1073' => wdpf ('P1073' => 'writable file format'), 'P1195' => wdpf ('P1195' => 'file extension'), + + # external-id + 'P503' => wdpf ('P503' => 'ISO standard'), # number of the ISO standard which normalizes the object + + # URLs + 'P854' => wdpf ('P854' => 'reference URL'), + 'P856' => wdpf ('P856' => 'official website'), + 'P953' => wdpf ('P953' => 'full text available at'), + 'P973' => wdpf ('P973' => 'described at URL'), + 'P1019' => wdpf ('P1019' => 'feed URL'), + 'P1065' => wdpf ('P1065' => 'archive URL'), + 'P1324' => wdpf ('P1324' => 'source code repository'), + 'P1325' => wdpf ('P1325' => 'external data available at'), + 'P1401' => wdpf ('P1401' => 'bug tracking system'), + 'P1581' => wdpf ('P1581' => 'official blog'), + 'P2699' => wdpf ('P2699' => 'URL'), + + # '' => wdpf ('' => ''), ); my @filters= sort keys %filters; diff --git a/wdq2.pl b/wdq2.pl index 33f04670ee82dbfc4188248df95706932b34fcdb..7e99f12977e705ee8d581824853f155dd71ab482 100755 --- a/wdq2.pl +++ b/wdq2.pl @@ -20,7 +20,7 @@ use Wiktionary::Utils; use PDS; my $seq= 'a'; -my $date= '2016-07-04'; +my $date= '2016-08-22'; my $lang= undef; my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq); my $cmp_fnm_pattern= '%s/wdq%05d.cmp'; @@ -127,7 +127,7 @@ sub scan_items # print "index: ", Dumper ($index); my ($idx_id, $idx_fo_num, $idx_pos_beg, $idx_pos_end)= map { $index->{$_} } qw(id fo_count fo_pos_beg fo_pos_end); - print "idx_id=[$idx_id] idx_fo_num=[$idx_fo_num] idx_pos_beg=[$idx_pos_beg] idx_pos_end=[$idx_pos_end]\n"; + # print "idx_id=[$idx_id] 
idx_fo_num=[$idx_fo_num] idx_pos_beg=[$idx_pos_beg] idx_pos_end=[$idx_pos_end]\n"; my $columns= $csv->{'columns'}; # print "columns: ", Dumper ($columns); @@ -280,6 +280,7 @@ sub get_items my $cnt_items= 0; foreach my $rec_num (sort { $a <=> $b } @rec_nums) { + print "rec_num=[$rec_num]\n"; my $data= $pds->retrieve ($rec_num); # main::hexdump ($data); my ($x_rec_num, $pos_idx, $f_num, $beg, $end, @x)= unpack ('LLLLLLLL', $data); @@ -292,6 +293,7 @@ sub get_items fo_pos_beg => $beg, fo_pos_end => $end, }; + print "row: ", Dumper ($row); if ($x_rec_num > 0) {