Skip to content
Snippets Groups Projects
Commit 66210841 authored by Gerhard Gonter's avatar Gerhard Gonter :speech_balloon:
Browse files

tracking new dumps

parent 93df63d0
No related branches found
No related tags found
No related merge requests found
wdq0.pl 0 → 100755
#!/usr/bin/perl
use strict;
use lib 'lib';
use FileHandle;
use JSON;
use Compress::Zlib;
use Util::Simple_CSV;
use Util::hexdump;
use Data::Dumper;
$Data::Dumper::Indent= 1;
use WikiData::Utils;
use Wiktionary::Utils;
use PDS;
# --- configuration / global state ----------------------------------------
# Where Wikidata entity dumps are published, and the fetch tool to use.
my $dumps_source= 'https://dumps.wikimedia.org/other/wikidata/';
my $wget= '/usr/bin/wget';

# Default dump generation: sequence letter and dump date; both can be
# overridden on the command line with --seq and --date.
my $seq= 'a';
my $date= '2017-04-10';
my $expected_size= 11605714337;  # byte size of the 2017-04-10 dump
my $lang= undef;                 # optional language selector (--lang)

my ($fnm, $data_dir, $out_dir)= WikiData::Utils::get_paths ($date, $seq);
my $cmp_fnm_pattern= '%s/wdq%05d.cmp';

# my $op_mode= 'find_items';
my $op_mode= 'get_items';
my $upd_paths= 0;                # set when --date/--seq/--lang change the paths

# Unbuffered STDOUT so progress messages appear immediately even when the
# output is piped to a log.  Method-call form instead of the indirect
# "autoflush STDOUT 1" syntax (FileHandle/IO::Handle is already loaded).
STDOUT->autoflush (1);

my @PARS= ();                    # positional (non-option) arguments
# --- command line parsing -------------------------------------------------
# Accepts "--name=value" or "--name value" options, "--" to pass the rest of
# the arguments through verbatim, and collects anything else in @PARS.
# The "defined" guard ensures a literal argument "0" or "" does not silently
# terminate option processing (a bare truth test on shift() would).
while (defined (my $arg= shift (@ARGV)))
{
  if ($arg eq '--') { push (@PARS, @ARGV); @ARGV=(); }
  elsif ($arg =~ /^--(.+)/)
  {
    my ($an, $av)= split ('=', $1, 2);
    print "an=[$an] av=[$av]\n";
    # "--opt=value" sets $av directly; a bare "--opt" takes the next word.
    # Test definedness (not truth) so "--seq=0" is not treated as missing.
    if ($an eq 'date')    { $date= defined ($av) ? $av : shift (@ARGV); $upd_paths= 1; }
    elsif ($an eq 'seq')  { $seq=  defined ($av) ? $av : shift (@ARGV); $upd_paths= 1; }
    elsif ($an eq 'lang') { $lang= defined ($av) ? $av : shift (@ARGV); $upd_paths= 1; }
    elsif ($an eq 'scan') { $op_mode= 'scan'; }
    else
    {
      # NOTE(review): usage() is not defined anywhere in this file --
      # confirm it is provided elsewhere, otherwise this dies at runtime.
      usage();
    }
  }
  elsif ($arg =~ /^-(.+)/)
  {
    # single-dash flags: none are recognized yet, so any flag shows usage
    foreach my $flag (split ('', $1))
    {
      usage();
    }
  }
  else { push (@PARS, $arg); }
}
# Main loop: poll the dump index once an hour and run the conversion for
# every dump listed there (fetch_and_convert skips already-processed ones).
for (;;)
{
  my $dump_list= check();
  # print "dump_list: ", Dumper ($dump_list);

  fetch_and_convert ($_->{date}, $seq, $_->{size}) for @$dump_list;

  print scalar localtime (time()), " sleeping...\n";
  sleep (3600);
}
exit (0);
=head2 fetch_and_convert ($date, $seq, $expected_size)

Fetch the dump for C<$date> (YYYY-MM-DD) unless its data directory already
exists, verify its size against C<$expected_size> (bytes, as advertised by
the dump index page), and run the wdq1/wdq2/wdq3 conversion pipeline.

Returns a true value on success, undef when the dump is missing, still
being fetched, oversized, or when a pipeline step fails.

=cut

sub fetch_and_convert
{
  my $date= shift;
  my $seq= shift;
  my $expected_size= shift;

  my $data_dir= sprintf ("data/%s%s", $date, $seq);
  print "date=[$date] data_dir=[$data_dir]\n";

  if (-d $data_dir)
  {
    # conversion already done (or at least started); nothing to do
    print "data_dir=[$data_dir] is already present\n";
    return 1;
  }

  print "fetching stuff for $date\n";
  my ($fetched, $dump_file)= fetch_dump ($date);

  # fetch_dump() returns an undefined size when the dump file is not
  # present locally.  Checking $fetched (not $dump_file, which is always a
  # defined string) prevents running the pipeline against missing data.
  unless (defined ($fetched))
  {
    print "ERROR: dump_file=[$dump_file] not available\n";
    return undef;
  }

  if ($fetched == $expected_size)
  {
    print "NOTE: fetched file seems good\n";
  }
  elsif ($fetched < $expected_size)
  {
    print "WARNING: fetch in progress? expected_size=[$expected_size] actual_size=[$fetched]; skipping...\n";
    return undef;
  }
  else
  {
    print "ERROR: fetched file too big? expected_size=[$expected_size] actual_size=[$fetched]\n";
    return undef;
  }

  # Run the conversion pipeline; stop at the first failing step so later
  # steps do not operate on incomplete intermediate data.
  my @cmd1= (qw(./wdq1.pl --date), $date);
  print "cmd1: [", join (' ', @cmd1), "]\n";
  if (system (@cmd1) != 0) { print "ERROR: wdq1.pl failed: $?\n"; return undef; }

  my @cmd2= (qw(./wdq2.pl --scan --date), $date);
  print "cmd2: [", join (' ', @cmd2), "]\n";
  if (system (@cmd2) != 0) { print "ERROR: wdq2.pl failed: $?\n"; return undef; }

  my @cmd3= (qw(./wdq3.pl --date), $date);
  print "cmd3: [", join (' ', @cmd3), "]\n";
  if (system (@cmd3) != 0) { print "ERROR: wdq3.pl failed: $?\n"; return undef; }

  return 1;
}
=head2 fetch_dump ($date)

Map a dump date to its file name on the server ("2017-04-10" becomes
"20170410.json.gz") and to the local path under dumps/.  The actual wget
download is currently disabled (commented out); only the command that
would be used is printed.

Returns C<($fetched, $dump_file)>: the local file's size in bytes (undef
when the file does not exist locally) and the server-side file name.

=cut

sub fetch_dump
{
  my $d= shift;

  $d =~ s{-}{}g;   # "YYYY-MM-DD" -> "YYYYMMDD"
  my $dump_file= $d . '.json.gz';
  my $l_dump_file= 'dumps/' . $dump_file;
  print "dump_file=[$dump_file] l_dump_file=[$l_dump_file]\n";

  unless (-f $l_dump_file)
  {
    my $dump_url= $dumps_source . $dump_file;
    my @cmd_fetch= ($wget, $dump_url, '-O'.$l_dump_file);
    print "cmd_fetch: [", join (' ', @cmd_fetch), "]\n";
    # return undef;
    # system (@cmd_fetch);
  }

  my @st= stat ($l_dump_file);
  my $fetched= (@st) ? $st[7] : undef;   # st[7] is the file size in bytes

  return ($fetched, $dump_file);
}
=head2 check ()

Download the dump index page with wget and extract the available dump
files from the directory listing's C<< <a href="YYYYMMDD.json.gz"> >>
lines.

Returns a list (list context) or arrayref (scalar context) of hashes with
keys C<dump_file> (server-side file name), C<date> (normalized to
YYYY-MM-DD) and C<size> (bytes, as printed in the listing).

=cut

sub check
{
  my $cmd_fetch= "$wget $dumps_source -O-";
  print "cmd_fetch=[$cmd_fetch]\n";

  # lexical filehandle instead of the bareword LST
  open (my $lst, '-|', $cmd_fetch) or die "can't run $cmd_fetch";

  my @res;
  while (<$lst>)
  {
    # chomp, not chop: chop would eat a data character if the final line
    # of the listing arrives without a trailing newline
    chomp;
    if (m#<a href="((\d{4})(\d{2})(\d{2})\.json\.gz)">(\d{8}\.json\.gz)</a>\s+(\S+)\s+(\S+)\s+(\d+)#)
    {
      my ($f1, $year, $mon, $day, $f2, $xdate, $time, $size)= ($1, $2, $3, $4, $5, $6, $7, $8);
      print "year=[$year] mon=[$mon] day=[$day] f1=[$f1] f2=[$f2] xdate=[$xdate] time=[$time] size=[$size]\n";
      my $rec=
      {
        dump_file => $f1,
        date => join ('-', $year, $mon, $day),
        size => $size,
      };
      push (@res, $rec);
    }
  }
  close ($lst);

  (wantarray) ? @res : \@res;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment