Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
W
wikidata-dump-processor
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Gerhard Gonter
wikidata-dump-processor
Commits
605effa5
Commit
605effa5
authored
Feb 19, 2019
by
Gerhard Gonter
Browse files
Options
Downloads
Patches
Plain Diff
debugging and messaging changed; added properties to extract
parent
42fff9f9
No related branches found
No related tags found
No related merge requests found
Pipeline
#2
failed
Feb 19, 2019
Stage: build
Stage: test
Changes
2
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
wdq0.pl
+8
-4
8 additions, 4 deletions
wdq0.pl
wdq1.pl
+224
-178
224 additions, 178 deletions
wdq1.pl
with
232 additions
and
182 deletions
wdq0.pl
+
8
−
4
View file @
605effa5
...
...
@@ -70,7 +70,7 @@ notify('starting wdq0 loop');
while
(
1
)
{
my
$dumps
=
check_dump
();
#
print "dumps: ", Dumper ($dumps);
print
"
dumps:
",
Dumper
(
$dumps
);
foreach
my
$dump
(
@$dumps
)
{
fetch_and_convert
(
$dump
->
{
date
},
$seq
,
$dump
->
{
size
});
...
...
@@ -87,7 +87,9 @@ sub notify
{
my
$msg
=
shift
;
system
(
qw(notify-sms.pl gg-uni)
,
$msg
);
print
"
NOTIFY: [
$msg
]
\n
";
system
(
qw(notify-sms.pl gg-uni)
,
scalar
localtime
(
time
()),
$msg
);
sleep
(
1
);
}
sub
fetch_and_convert
...
...
@@ -105,7 +107,7 @@ sub fetch_and_convert
}
else
{
print
"
fetching stuff for
$
date
\n
";
print
"
fetching stuff for date
=
$date
seq=
$seq
data_dir=[
$data_dir
]
\n
";
notify
("
wdq0: about to fetch dump for
$date
");
my
(
$fetched
,
$dump_file
)
=
fetch_dump
(
$date
);
...
...
@@ -195,13 +197,15 @@ sub check_dump
print
"
cmd_fetch=[
$cmd_fetch
]
\n
";
open
(
LST
,
'
-|
',
$cmd_fetch
)
or
die
"
can't run
$cmd_fetch
";
my
@res
;
while
(
<
LST
>
)
LST:
while
(
<
LST
>
)
{
chop
;
if
(
m#<a href="((\d{4})(\d{2})(\d{2})\.json\.gz)">(\d{8}\.json\.gz)</a>\s+(\S+)\s+(\S+)\s+(\d+)#
)
{
my
(
$f1
,
$year
,
$mon
,
$day
,
$f2
,
$xdate
,
$time
,
$size
)
=
(
$
1
,
$
2
,
$
3
,
$
4
,
$
5
,
$
6
,
$
7
,
$
8
);
print
"
year=[
$year
] mon=[
$mon
] day=[
$day
] f1=[
$f1
] f2=[
$f2
] xdate=[
$xdate
] time=[
$time
] size=[
$size
]
\n
";
next
LST
if
(
$size
<=
63
);
next
LST
if
(
$size
<=
30_000_000_000
);
my
$rec
=
{
dump_file
=>
$f1
,
...
...
This diff is collapsed.
Click to expand it.
wdq1.pl
+
224
−
178
View file @
605effa5
...
...
@@ -103,11 +103,10 @@ sub analyze_wikidata_dump
{
my
$fnm
=
shift
;
open
(
DIAG
,
'
>:utf8
',
'
@diag
')
or
die
;
# statistics
my
%types
;
my
%attrs
;
my
%count_snaktype
;
# item statistics
my
%lang_labels
;
...
...
@@ -118,6 +117,20 @@ my %name_sitelinks;
my
%props
;
unless
(
-
d
$data_dir
)
{
print
"
mkdir
$data_dir
\n
";
mkdir
(
$data_dir
);
}
unless
(
-
d
$out_dir
)
{
print
"
mkdir
$out_dir
\n
";
mkdir
(
$out_dir
)
}
my
$diag_file
=
$data_dir
.
'
/@diag
';
open
(
DIAG
,
'
>:utf8
',
$diag_file
)
or
die
"
can't write diag file=[
$diag_file
]
";
my
@item_attrs
=
qw(labels descriptions aliases claims sitelinks)
;
my
$running
=
1
;
...
...
@@ -137,17 +150,6 @@ my %props;
my
$line
=
0
;
my
$t_start
=
time
();
unless
(
-
d
$data_dir
)
{
print
"
mkdir
$data_dir
\n
";
mkdir
(
$data_dir
);
}
unless
(
-
d
$out_dir
)
{
print
"
mkdir
$out_dir
\n
";
mkdir
(
$out_dir
)
}
# item list
my
$fnm_items
=
$data_dir
.
'
/items.csv
';
...
...
@@ -155,7 +157,7 @@ local *FO_ITEMS;
open
(
FO_ITEMS
,
'
>:utf8
',
$fnm_items
)
or
die
"
can't write to [
$fnm_items
]
";
my
@cols1
=
qw(line pos fo_count fo_pos_beg fo_pos_end id type cnt_label cnt_desc cnt_aliases cnt_claims cnt_sitelink lang label)
;
print
FO_ITEMS
join
(
$TSV_SEP
,
@cols1
,
qw(filtered_props claims)
),
"
\n
";
autoflush
FO_ITEMS
1
;
#
autoflush FO_ITEMS 1;
# properties
my
@cols_filt
=
(
@cols1
,
'
val
');
...
...
@@ -190,7 +192,9 @@ my %filters=
# person identifiers
'
P214
'
=>
wdpf
('
P214
',
'
VIAF identifier
'),
'
P496
'
=>
wdpf
('
P496
',
'
ORCID identifier
'),
'
P651
'
=>
wdpf
('
P651
',
'
Biografisch Portaal number
'),
# identifier at Biografisch Portaal van Nederland
'
P2280
'
=>
wdpf
('
P2280
',
'
Austrian Parliament ID
'),
# identifier for an individual, in the Austrian Parliament's "Who's Who" database
'
P3421
'
=>
wdpf
('
P3421
',
'
Belvedere artist ID
'),
# identifier assigned to an artist by the Österreichische Galerie Belvedere in Vienna
# personal data?
'
P569
'
=>
wdpf
('
P569
',
'
Date of birth
'),
...
...
@@ -229,10 +233,31 @@ my %filters=
'
P434
'
=>
wdpf
('
P434
',
'
MusicBrainz artist id
'),
'
P435
'
=>
wdpf
('
P435
',
'
MusicBrainz work id
'),
'
P436
'
=>
wdpf
('
P436
',
'
MusicBrainz release group id
'),
'
P966
'
=>
wdpf
('
P966
',
'
MusicBrainz label ID
'),
'
P982
'
=>
wdpf
('
P982
',
'
MusicBrainz area ID
'),
'
P1004
'
=>
wdpf
('
P1004
',
'
MusicBrainz place id
'),
# BookBrainz
'
P1407
'
=>
wdpf
('
P1407
',
'
MusicBrainz series id
'),
'
P4404
'
=>
wdpf
('
P4404
',
'
MusicBrainz recording id
'),
'
P5813
'
=>
wdpf
('
P5813
',
'
MusicBrainz release id
'),
# AllMusic
'
P1728
'
=>
wdpf
('
P1728
',
'
AllMusic artist ID
'),
'
P1729
'
=>
wdpf
('
P1728
',
'
AllMusic album ID
'),
'
P1730
'
=>
wdpf
('
P1730
',
'
AllMusic song ID
'),
'
P1994
'
=>
wdpf
('
P1994
',
'
AllMusic composition ID
'),
'
P6110
'
=>
wdpf
('
P6110
',
'
AllMusic release ID
'),
'
P6306
'
=>
wdpf
('
P6306
',
'
AllMusic performance ID
'),
# Google Play Music
'
P4198
'
=>
wdpf
('
P4198
',
'
Google Play Music artist ID
'),
'
P4199
'
=>
wdpf
('
P4199
',
'
Google Play Music album ID
'),
# Amazon Music database
'
P6276
'
=>
wdpf
('
P6276
',
'
Amazon Music artist ID
'),
# Books
'
P2607
'
=>
wdpf
('
P2607
',
'
BookBrainz creator ID
'),
# identifier for a creator per the BookBrainz open book encyclopedia
'
P123
'
=>
wdpf
('
P123
',
'
publisher
'),
# organization or person responsible for publishing books, periodicals, games or software
# WorldCat
'
P2163
'
=>
wdpf
('
P163
',
'
FAST-ID
'),
# authority control identifier in WorldCat's “FAST Linked Data” authority file
...
...
@@ -282,6 +307,10 @@ my %filters=
'
P1581
'
=>
wdpf
('
P1581
'
=>
'
official blog
'),
'
P2699
'
=>
wdpf
('
P2699
'
=>
'
URL
'),
# other person identifiers
'
P5246
'
=>
wdpf
('
P5246
'
=>
'
Pornhub ID
'),
'
P5267
'
=>
wdpf
('
P5267
'
=>
'
YouPorn ID
'),
'
P5540
'
=>
wdpf
('
P5540
'
=>
'
RedTube ID
'),
# '' => wdpf ('' => ''),
);
my
@filters
=
sort
keys
%filters
;
...
...
@@ -498,19 +527,35 @@ my $fo_count= $fo_rec->open();
my
$p
=
$jc
->
{
$property
};
# print "p: ", Dumper ($p);
my
$ms
;
eval
{
$ms
=
$p
->
[
0
]
->
{
mainsnak
}
};
if
(
$@
)
{
print
DIAG
"
id=
$id
ERROR: no mainsnak element; property=[
$property
] e=[$@] property=
",
Dumper
(
$p
);
next
PROP
;
}
my
$snaktype
=
$ms
->
{
snaktype
};
$count_snaktype
{
$snaktype
}
++
;
if
(
$snaktype
ne
'
value
')
{
print
DIAG
"
id=
$id
NOTE: snaktype=[
$snaktype
], property=[
$property
]
\n
";
next
PROP
;
}
my
$x
;
eval
{
$x
=
$
p
->
[
0
]
->
{'
mainsnak
'}
->
{
'
datavalue
'
}
->
{
'
value
'
}
};
eval
{
$x
=
$
ms
->
{
datavalue
}
->
{
value
}
};
# print "x: ", Dumper ($x); # exit;
if
(
$@
)
{
print
DIAG
"
id=
$id
error:
property=[
$property
]
$x
=[
$x
] e=[$@] property=
",
Dumper
(
$p
);
print
DIAG
"
id=
$id
ERROR: no value element;
property=[
$property
]
$x
=[
$x
] e=[$@] property=
",
Dumper
(
$p
);
next
PROP
;
}
elsif
(
!
defined
(
$x
))
{
print
DIAG
"
id=
$id
undef x
: property=[
$property
] property=
",
Dumper
(
$p
);
print
DIAG
"
id=
$id
NOTE: undef property value
: property=[
$property
] property=
",
Dumper
(
$p
);
next
PROP
;
}
...
...
@@ -633,6 +678,7 @@ my $fo_count= $fo_rec->open();
print
STATS
"
lines:
$line
\n
";
print
STATS
"
fo_count:
$fo_count
\n
";
print
STATS
"
cnt_authctrl:
$cnt_authctrl
\n
";
print
STATS
"
snaktypes:
",
Dumper
(
\
%count_snaktype
);
}
if
(
$exp_bitmap
==
1
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment