Gerhard Gonter / wikidata-dump-processor / Commits

Commit 02fe41dd, authored 8 years ago by Gerhard Gonter
Parent: 85a7d892

fixed incorrect page_num calculation and thus index corruption
Showing 4 changed files with 80 additions and 22 deletions:

  .gitignore   +2  −0
  lib/PDS.pm   +40 −12
  wdq1.pl      +34 −8
  wdq2.pl      +4  −2
.gitignore  +2 −0

@@ -4,6 +4,7 @@ data/
 wkt-??/
 tmp/
 @*
+*.swp
 *.tys
 *.items
 items.csv

@@ -17,6 +18,7 @@ P234.csv
 P496.csv
 P625.csv
 P*.csv
+P*.tsv
 Q*
 PDS_backing.pages
 latest
lib/PDS.pm  +40 −12
@@ -39,6 +39,8 @@ my %defaults=
   page_hits => [], # number of times a page was loaded!
 );
 
+my $DEBUG= 0;
+
 sub new
 {
   my $class= shift;

@@ -68,9 +70,26 @@ sub new
   print "opened paging backing file [$self->{backing_file}] in mode [$bf_mode]\n";
   $self->{__FPDS__}= *FPDS;
 
+  $self->debug_hdr() if ($DEBUG > 0);
+
   $self;
 }
 
+sub debug_hdr
+{
+  my $self= shift;
+
+  print "--- 8< ---\n";
+  print "caller: ", join (' ', caller()), "\n";
+  printf ("paging: page_size=[0x%08lX] page_hdr_size=[0x%04X] rec_size=[0x%04X] recs_per_page=[0x%08lX] backing_file=[%s]\n", map { $self->{$_} } qw(page_size page_hdr_size rec_size recs_per_page backing_file));
+  printf ("page_info: last_page_num=[%d] highest_page_num=[%d] last_page=[%s]\n", map { $self->{$_} } qw(last_page_num highest_page_num last_page));
+  printf ("counter: page_same=[%d] page_next=[%d] page_up=[%d] page_down=[%d]\n", map { $self->{$_} } qw(cnt_page_same cnt_page_next cnt_page_up cnt_page_down));
+  print "--- >8 ---\n";
+}
+
 sub set
 {
   my $self= shift;
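The new debug_hdr dumps the pager's geometry and page-movement counters; it only fires when the file-scoped $DEBUG is raised (a lexical my variable, so it has to be edited in the module itself). Given the printf formats above, its output would look roughly like this, with invented values (recs_per_page here assumes a 64 KiB page with a 32-byte header and 32-byte records):

    --- 8< ---
    caller: PDS lib/PDS.pm 85
    paging: page_size=[0x00010000] page_hdr_size=[0x0020] rec_size=[0x0020] recs_per_page=[0x000007FF] backing_file=[PDS_backing.pages]
    page_info: last_page_num=[3] highest_page_num=[17] last_page=[HASH(0x55d3a8f2c610)]
    counter: page_same=[120] page_next=[14] page_up=[3] page_down=[2]
    --- >8 ---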
@@ -103,7 +122,8 @@ sub retrieve
 # print "pdsp: rec_num=[$rec_num] page_num=[$pdsp->{page_num}] rel_rec_num=[$rel_rec_num] rel_rec_pos=[$rel_rec_pos]\n";
   my $d= substr ($pdsp->{buffer}, $rel_rec_pos, $self->{rec_size});
-# print "d:\n"; main::hexdump ($d);
+print "d:\n"; main::hexdump ($d);
+#print "buffer:\n"; main::hexdump ($pdsp->{buffer});
   $d;
 }
@@ -113,13 +133,16 @@ sub get_page_by_rec_num
   my $self= shift;
   my $rec_num= shift;
 
-  my ($rec_size, $last_page_num, $last_page)= map { $self->{$_} } qw(rec_size last_page_num $last_page);
+  print "get_page_by_rec_num: rec_num=[$rec_num]\n" if ($DEBUG > 2);
+  my ($rec_size, $recs_per_page, $last_page_num, $last_page)= map { $self->{$_} } qw(rec_size recs_per_page last_page_num last_page);
 
-  my $page_num= int ($rec_num * $rec_size / $self->{page_size});
-  my $rel_rec_num= $rec_num % $self->{recs_per_page};
+# my $page_num= int ($rec_num * $rec_size / $self->{page_size});
+  my $page_num= int ($rec_num / $recs_per_page);
+  my $rel_rec_num= $rec_num % $recs_per_page;
   my $rel_rec_pos= $self->{page_hdr_size} + $rel_rec_num * $rec_size;
+  print "get_page_by_rec_num: page_num=[$page_num] rel_rec_num=[$rel_rec_num] rel_rec_pos=[$rel_rec_pos]\n" if ($DEBUG > 2);
 
 # print __LINE__, " rec_num=[$rec_num] page_num=[$page_num]\n";
   if ($page_num == $last_page_num)
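This hunk is the fix named in the commit message. Every page begins with a header of page_hdr_size bytes (see the rel_rec_pos formula above), so fewer records fit on a page than page_size / rec_size suggests. The old expression int ($rec_num * $rec_size / $self->{page_size}) divides a raw byte offset that ignores those headers, so once the skipped header bytes add up to a whole record it starts mapping records to the wrong page; that mismatch is the index corruption. A standalone sketch, with invented geometry (the module's real values may differ) and recs_per_page derived in the way the rel_rec_pos formula implies:

    #!/usr/bin/perl
    # old vs. new page_num calculation; all geometry values are hypothetical
    use strict;
    use warnings;

    my $page_size    = 4096;  # bytes per page
    my $page_hdr_size= 32;    # header at the start of each page
    my $rec_size     = 32;    # fixed record size
    my $recs_per_page= int (($page_size - $page_hdr_size) / $rec_size);  # 127

    foreach my $rec_num (126, 127, 254)
    {
      my $old= int ($rec_num * $rec_size / $page_size);  # buggy: ignores page headers
      my $new= int ($rec_num / $recs_per_page);          # fixed
      printf ("rec_num=[%3d] old page_num=[%d] new page_num=[%d]%s\n",
              $rec_num, $old, $new, ($old == $new) ? '' : '  <== differs');
    }

With 127 records per page, record 127 is the first record of page 1, but the byte-offset division still yields page 0 (127 * 32 = 4064 < 4096); from that record on, reads and writes land on the wrong page.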
@@ -189,8 +212,9 @@ sub print_page_info
 {
   my $self= shift;
 
-  print "page_size=[$self->{page_size}]\n";
-  print "recs_per_page=[$self->{recs_per_page}]\n";
+  printf ("page_size=[0x%08lX]\n", $self->{page_size});
+  printf ("rec_size=[0x%08lx]\n", $self->{rec_size});
+  printf ("recs_per_page=[0x%08lx]\n", $self->{recs_per_page});
   $self->print_page_stats();
   print "highest_page_num=[$self->{highest_page_num}]\n";
@@ -203,7 +227,8 @@ sub load_page
   my $self= shift;
   my $page_num= shift;
 
-# print "loading page_num=[$page_num]\n";
+# print '='x72, "\nloading page_num=[$page_num]\n";
+# if (0 && $page_num >= 200) { print "EXIT at page 200!\n"; exit; }
 
   my $new_page=
   {

@@ -221,8 +246,9 @@ sub load_page
   local *FPDS= $self->{'__FPDS__'};
   my $page_size= $self->{page_size};
+# $self->debug_hdr();
 
   my $rc= seek (FPDS, $page_pos, 0);
-# print "seek: rc=[$rc]\n";
+# printf ("%d seek: pos=[0x%08lX] rc=[%d]\n", __LINE__, $page_pos, $rc);
   my $new_buffer;
   my $bc= sysread (FPDS, $new_buffer, $page_size);
   unless ($bc == $page_size)
@@ -271,7 +297,7 @@ sub flush_page
   my ($page, $page_num)= map { $self->{$_} } qw(last_page last_page_num);
 
-# print "flushing page_num=[$page_num]\n";
+  print '='x72, "\nflushing page_num=[$page_num]\n" if ($DEBUG > 1);
 
   return undef unless ($page_num >= 0 && defined ($page));
 
 # print "TODO: writing data page_num=[$page_num]\n";

@@ -284,8 +310,9 @@ sub flush_page
   my @d= @{$page->{dirty}};
   my $b= $page->{buffer};
-# my $cnt_dirty= @d;
-# print "flush: page_num=[$page_num] cnt_dirty=[$cnt_dirty]\n";
+  my $cnt_dirty= @d;
+  print "flush: page_num=[$page_num] cnt_dirty=[$cnt_dirty]\n" if ($DEBUG > 1);
+# $self->debug_hdr();
 
   my $new_buffer= $self->setup_header ($page_num, 0x12345678);
 # print "new_buffer length=[",length($new_buffer), "]\n";

@@ -325,8 +352,9 @@ sub flush_page
   }
 
   local *FPDS= $self->{'__FPDS__'};
+# $self->debug_hdr();
 
   my $rc= seek (FPDS, $page->{page_pos}, 0);
-# print "seek: rc=[$rc]\n";
+# printf ("%d seek: pos=[0x%08lX] rc=[%d]\n", __LINE__, $page->{page_pos}, $rc);
   my $bc= syswrite (FPDS, $new_buffer, $page_size);
   unless ($bc == $page_size)
 {
wdq1.pl  +34 −8
@@ -24,7 +24,7 @@ my $exp_bitmap= 0; # 1..does not work; 2..makes no sense, too sparsely populated
 # not used my $LR_max_propid= 1930; # dump from 20150608
 my $seq= 'a';
-my $date= '2016-08-16'; # maybe a config file is in order to set up the defaults...
+my $date= '2016-08-22'; # maybe a config file should be used to set up the defaults...
 my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq);
 my $upd_paths= 0;
@@ -173,12 +173,16 @@ my %filters=
   'P361' => wdpf ('P361', 'part of', 1),
   'P1269' => wdpf ('P1269', 'facet of', 1),
 
-  # person identifiers
+  # item identifer (persons, places, etc.)
+  'P213' => wdpf ('P213', 'ISNI'), # International Standard Name Identifier for an identity
   'P227' => wdpf ('P227', 'GND identifier'),
+  'P244' => wdpf ('P244', 'LCAuth ID'), # Library of Congress ID for authority control (for books use P1144)
+  'P1245' => wdpf ('P1245', 'OmegaWiki Defined Meaning'), # "Defined Meaning" on the site OmegaWiki
+
+  # person identifiers
   'P214' => wdpf ('P214', 'VIAF identifier'),
   'P496' => wdpf ('P496', 'ORCID identifier'),
-  'P213' => wdpf ('P213', 'ISNI'), # check
+  'P2280' => wdpf ('P2280', 'Austrian Parliament ID'), # identifier for an individual, in the Austrian Parliament's "Who's Who" database
 
   # personal data?
   'P569' => wdpf ('P569', 'Date of birth'),
@@ -186,17 +190,19 @@ my %filters=
   'P2298' => wdpf ('P2298', 'NSDAP membership number (1925-1945)'),
 
   # publications
-  'P345' => wdpf ('P345', 'IMDb identifier'),
   'P212' => wdpf ('P212', 'ISBN-13'),
   'P236' => wdpf ('P212', 'ISSN'),
-  'P356' => wdpf ('P356', 'DOI'),
+  'P345' => wdpf ('P345', 'IMDb identifier'),
+  'P698' => wdpf ('P698', 'PubMed ID'), # identifier for journal articles/abstracts in PubMed
   'P957' => wdpf ('P957', 'ISBN-10'),
+  'P3035' => wdpf ('P3035', 'ISBN publisher prefix'), # ISBN publisher prefix
 
   # arXiv.org
   'P818' => wdpf ('P818', 'arXiv ID'),
   'P820' => wdpf ('P820', 'arXiv classification'),
 
   # permanent identifiers
+  'P356' => wdpf ('P356', 'DOI'),
   'P1184' => wdpf ('P1184', 'Handle'),
   'P727' => wdpf ('P727', 'Europeana ID'),
   'P1036' => wdpf ('P1036', 'Dewey Decimal Classification'),
@@ -217,8 +223,10 @@ my %filters=
   'P436' => wdpf ('P436', 'MusicBrainz release group id'),
   'P1004' => wdpf ('P1004', 'MusicBrainz place id'),
 
-  # misc.
+  # Geography
   'P625' => wdpf ('P625', 'Geo Coordinates'),
+  '1566' => wdpf ('P1566', 'GeoNames ID'),
+  'P964' => wdpf ('P964', 'Austrian municipality key'), # identifier for municipalities in Austria
 
   # chemistry
   'P233' => wdpf ('P233', 'SMILES'), # Simplified Molecular Input Line Entry Specification
@@ -240,6 +248,24 @@ my %filters=
   'P1072' => wdpf ('P1072' => 'readable file format'),
   'P1073' => wdpf ('P1073' => 'writable file format'),
   'P1195' => wdpf ('P1195' => 'file extension'),
 
+  # external-id
+  'P503' => wdpf ('P503' => 'ISO standard'), # number of the ISO standard which normalizes the object
+
+  # URLs
+  'P854' => wdpf ('P854' => 'reference URL'),
+  'P856' => wdpf ('P856' => 'official website'),
+  'P953' => wdpf ('P953' => 'full text available at'),
+  'P973' => wdpf ('P973' => 'described at URL'),
+  'P1019' => wdpf ('P1019' => 'feed URL'),
+  'P1065' => wdpf ('P1065' => 'archive URL'),
+  'P1324' => wdpf ('P1324' => 'source code repository'),
+  'P1325' => wdpf ('P1325' => 'external data available at'),
+  'P1401' => wdpf ('P1401' => 'bug tracking system'),
+  'P1581' => wdpf ('P1581' => 'official blog'),
+  'P2699' => wdpf ('P2699' => 'URL'),
+
+  # '' => wdpf ('' => ''),
 );
 
 my @filters= sort keys %filters;
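Each %filters entry maps a Wikidata property ID to whatever wdpf (property, label, optional flag) builds; wdpf itself is defined elsewhere in wdq1.pl and is not part of this diff. A minimal sketch of how such a keyed table can drive per-property dispatch, using a stand-in wdpf that is an assumption, not the script's real constructor:

    #!/usr/bin/perl
    use strict;
    use warnings;

    # stand-in for wdq1.pl's wdpf(); the real one is not shown in this diff
    sub wdpf
    {
      my ($prop, $label, $flag)= @_;
      return { prop => $prop, label => $label, flag => $flag };
    }

    my %filters=
    (
      'P356' => wdpf ('P356', 'DOI'),
      'P361' => wdpf ('P361', 'part of', 1),
    );

    # claims scanned from a dump would be looked up by property ID
    foreach my $property ('P356', 'P999')
    {
      if (my $f= $filters{$property}) { print "filter matched: $property => $f->{label}\n"; }
      else                            { print "no filter for $property\n"; }
    }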
wdq2.pl  +4 −2
@@ -20,7 +20,7 @@ use Wiktionary::Utils;
 use PDS;
 
 my $seq= 'a';
-my $date= '2016-07-04';
+my $date= '2016-08-22';
 my $lang= undef;
 
 my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq);
 my $cmp_fnm_pattern= '%s/wdq%05d.cmp';
@@ -127,7 +127,7 @@ sub scan_items
 # print "index: ", Dumper ($index);
   my ($idx_id, $idx_fo_num, $idx_pos_beg, $idx_pos_end)= map { $index->{$_} } qw(id fo_count fo_pos_beg fo_pos_end);
-  print "idx_id=[$idx_id] idx_fo_num=[$idx_fo_num] idx_pos_beg=[$idx_pos_beg] idx_pos_end=[$idx_pos_end]\n";
+# print "idx_id=[$idx_id] idx_fo_num=[$idx_fo_num] idx_pos_beg=[$idx_pos_beg] idx_pos_end=[$idx_pos_end]\n";
 
   my $columns= $csv->{'columns'};
 # print "columns: ", Dumper ($columns);
@@ -280,6 +280,7 @@ sub get_items
   my $cnt_items= 0;
   foreach my $rec_num (sort { $a <=> $b } @rec_nums)
   {
+    print "rec_num=[$rec_num]\n";
     my $data= $pds->retrieve ($rec_num);
 # main::hexdump ($data);
     my ($x_rec_num, $pos_idx, $f_num, $beg, $end, @x)= unpack ('LLLLLLLL', $data);
@@ -292,6 +293,7 @@ sub get_items
       fo_pos_beg => $beg,
       fo_pos_end => $end,
     };
+    print "row: ", Dumper ($row);
 
     if ($x_rec_num > 0)
     {
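The trace lines added here bracket get_items' fixed-layout index records: unpack ('LLLLLLLL', $data) yields eight unsigned 32-bit values, the first five being the record's own number, an index position, the found-object count, and the begin/end offsets; the remainder lands in @x. A round-trip sketch with invented values (the writer side of the real record format is not part of this diff, so the layout beyond those five fields is an assumption):

    #!/usr/bin/perl
    use strict;
    use warnings;

    # pack a hypothetical index record: 8 unsigned 32-bit integers = 32 bytes
    my $data= pack ('LLLLLLLL', 4711, 12, 3, 1_000_000, 1_002_345, 0, 0, 0);
    printf ("record is %d bytes\n", length ($data));

    # read it back the way get_items does
    my ($x_rec_num, $pos_idx, $f_num, $beg, $end, @x)= unpack ('LLLLLLLL', $data);
    print "x_rec_num=[$x_rec_num] pos_idx=[$pos_idx] f_num=[$f_num] beg=[$beg] end=[$end]\n";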