diff --git a/eprints1.pl b/eprints1.pl index c09463fed9104801817c318ae006cec6b40d9e45..bef6e3a09cab265f7f955f750b3554eae874f978 100755 --- a/eprints1.pl +++ b/eprints1.pl @@ -117,8 +117,9 @@ my $op_mode= 'unknown'; # my $ot2ut_context= 'ot2ut-entw'; # TODO: parametrize my $ot2ut_context= 'ot2ut-test'; # TODO: parametrize # my $ot2ut_context= 'ot2ut-prod'; # TODO: parametrize -my $oma_sleep_time= 10; -my $MAX_BLOCK= 690; +my $oma_sleep_time= 5; +my $activity_period= 12; +my $MAX_BLOCK= 699; my %map_ot2ut_roles= ( @@ -182,7 +183,7 @@ my $flag_add_identifiers= 1; my %bucketlist_column_descriptions= ( bucket => 'Kategorie bzw. Klassifizierung - wird dynamisch erstellt - somit keine ID', - eprint_status => 'Status [Archiv/Buffer]', + eprint_status => 'Status [archive/buffer/etc]', einverstaendnis => 'Werknutzung [TRUE/FALSE/NULL=nicht vorhanden]', sperre => 'Sperre nach ยง86 gesetzt [TRUE/FALSE/NULL=nicht vorhanden]', hds => 'Has Date Sperre [future/past/no]', @@ -414,6 +415,7 @@ while (defined ($arg= shift (@ARGV))) elsif ($opt eq 'mab-age') { $MAX_MAB_AGE= $val || shift (@ARGV); } # in seconds elsif ($opt eq 'marc-age'){ $MAX_MARC_AGE= $val || shift (@ARGV); } # in seconds elsif ($opt eq 'export-eprints') { $op_mode= 'export-eprints'; } + elsif ($opt eq 'export-migration') { $op_mode= 'export-migration'; } elsif ($opt eq 'reset') { $op_mode= $opt; } elsif ($opt eq 'force') { $force= defined($val) ? $val : 1; } elsif ($opt eq 'upload') { $do_upload= defined($val) ? 
$val : 1; } @@ -452,7 +454,6 @@ my $db_ot2ut; my $col_msg; my $col_activity; my $last_activity= 0; -my $activity_period= 60; my $cnf= Util::JSON::read_json_file ($config_fnm); @@ -544,6 +545,10 @@ elsif ($op_mode eq 'export-eprints') # WIP export_csv ($irma_na, 'eprints'); } +elsif ($op_mode eq 'export-migration') # WIP +{ + export_migration_data(); +} elsif ($op_mode eq 'doi4ep') # WIP, started 2019-06-18 { doi4ep(); @@ -2408,6 +2413,12 @@ print __LINE__, " verify1_urn_registration: context=[$context]\n"; } } +=head2 export_csv + +export data from irma db + +=cut + sub export_csv { my $irma_na= shift; @@ -2688,6 +2699,7 @@ sub oma print __LINE__, " oma: row: ", Dumper($row); my $new_status= 'failed'; + my $next_action; if ($row->{action} eq 'send_batch') { # allow batch sizes up to 1000; be silent for batch sizes bigger than 10 items my $bs= $row->{batch_size}; @@ -2727,8 +2739,13 @@ sub oma send_message("send_block: block $block, result: $res"); $new_status= 'done' if (@$synced); - - policies_stats("processed block $block in context $ot2ut_context"); + # policies_stats("processed block $block in context $ot2ut_context"); + } + elsif ($row->{action} eq 'policies_stats') + { + $col_req->update({ _id => $row->{_id}}, { '$set' => { status => 'in_progress', ts_start => Util::ts::ts_ISO_gmt() }}); + policies_stats(); + $new_status= 'done'; } elsif ($row->{action} eq 'send_ids') { @@ -2749,10 +2766,29 @@ sub oma } elsif ($row->{action} eq 'reload') { - exec($0); + activity({ activity => 'reloading' }); + send_message('reloading'); + $next_action= 'reload'; + $new_status= 'done'; + } + elsif ($row->{action} eq 'stop') + { + activity({ activity => 'stopping' }); + send_message('exiting'); + $next_action= 'exit'; + $new_status= 'done'; } + # update job $col_req->update({ _id => $row->{_id}}, { '$set' => { status => $new_status, ts_finish => Util::ts::ts_ISO_gmt() }}); + + if (defined ($next_action)) + { + if ($next_action eq 'reload') { exec($0); } + elsif ($next_action eq 
'exit') { exit(0); } + $next_action= undef; + } + activity({ activity => 'listening', msg => ": context $ot2ut_context" }); } } @@ -3370,8 +3406,8 @@ sub generate_utheses_metadata print __LINE__, " docs: ", Dumper($docs); print __LINE__, " main_file: ", Dumper($main_file); - my $utheses_json_path= 'othes/utheses_json/' . $eprintid . '.json'; - my $utheses_upload_result_json_path= 'othes/utheses_json/' . $eprintid . '_upload_result.json'; + my $utheses_json_path= 'othes/utheses_json/' . $ot2ut_context . '/' . $eprintid . '.json'; + my $utheses_upload_result_json_path= 'othes/utheses_json/' . $ot2ut_context . '/' . $eprintid . '_upload_result.json'; my $lastmod= sprintf("%4d-%02d-%02dT%02d%02d%02d", map { $row->{$_} } qw(lastmod_year lastmod_month lastmod_day lastmod_hour lastmod_minute lastmod_second)); # my $lastmod= get_othes_timestamp($row, 'lastmod'); that's a different format: yyyy-mm-ddTHH:MM:SSZ @@ -4579,6 +4615,12 @@ sub get_redis_db undef; } +=head2 export_redirect + +export redis redirection data + +=cut + sub export_redirect { # $cnf is global @@ -5142,6 +5184,7 @@ sub policies_stats } # END annotations my $bucket_cnt= $cctab->show_tsv(['othes', @contexts], 'counters.tsv'); + my $now= scalar localtime(time()); my $msg_html= 'update'; my %stale_uploads= (); @@ -5425,13 +5468,13 @@ EOX <p>Objects, which were updated on othes after the upload to the target instance.</p> EOX - my $last_block; # my %stale_count; foreach my $ctx (@stale_contexts) { my $stu= $stale_uploads{$ctx}; print IDX "<h3>stale uploads context $ctx</h3>\n"; print IDX "<p>count: ", scalar(@$stu), "</p>\n"; + my $last_block= -1; foreach my $stale_object (@$stu) { my ($eprint_id, $utheses_id)= @$stale_object; @@ -5445,44 +5488,34 @@ EOX print IDX ' ', $eprint_id; # $stale_count{$ctx}++; } - print IDX "</p>\n" if (defined ($last_block)); + print IDX "</p>\n" if ($last_block >= 0); } # print IDX "stale counter: ", join(' ', %stale_count), "\n"; Util::JSON::write_json_file('stale_uploads.json', 
\%stale_uploads); } -=begin comment - if (exists ($incomplete_blocks{'entw'})) { my $ibe= $incomplete_blocks{'entw'}; print IDX "<h2>incomplete blocks in context entw</h2>\n"; - print IDX join(' ', @$ibe), "\n"; + print IDX join(' ', map { '<a href="http://xx2.test.univie.ac.at:3001/html/block/'.$_.'" target="opa">'.$_.'</a>' } @$ibe), "\n"; } -=end comment -=cut - if (exists ($incomplete_blocks{'test'})) { my $ibt= $incomplete_blocks{'test'}; print IDX "<h2>incomplete blocks in context test</h2>\n"; - print IDX join(' ', @$ibt), "\n"; + print IDX join(' ', map { '<a href="http://xx2.test.univie.ac.at:3001/html/block/'.$_.'" target="opa">'.$_.'</a>' } @$ibt), "\n"; } -=begin comment - if (exists ($incomplete_blocks{'prod'})) { my $ibp= $incomplete_blocks{'prod'}; print IDX "<h2>incomplete blocks in context prod</h2>\n"; - print IDX join(' ', @$ibp), "\n"; + print IDX join(' ', map { '<a href="http://xx2.test.univie.ac.at:3001/html/block/'.$_.'" target="opa">'.$_.'</a>' } @$ibp), "\n"; } -=end comment -=cut - Util::JSON::write_json_file('incomplete_blocks.json', \%incomplete_blocks); print IDX "<h2>errors</h2>\n". 
Dumper(\%cnt_errors); @@ -5511,6 +5544,116 @@ EOX Util::JSON::write_json_file('docs_with_holes.json', \%docs_with_holes); } +=head2 export_migration_data + +export table with data from utheses migration + +=cut + +sub export_migration_data +{ + $db_ot2ut= IRMA::db::get_any_db($cnf, 'ot2ut_database') unless (defined ($db_ot2ut)); + + my %data; + + # pass 1: get data from sync db + my @sync_fields= qw(eprint_id eprint_status utheses_id ts_upload upload_status); + { + my $col_sync= $db_ot2ut->get_collection('sync'); + my $cur_sync= $col_sync->find({ context => $ot2ut_context }); + $cur_sync->fields( { map { $_ => 1 } @sync_fields } ); + + print __LINE__, " reading sync db\n"; + while ($running) + { + my $row_sync= $cur_sync->next(); + last unless (defined ($row_sync)); + # print __LINE__, " row_sync: ", Dumper($row_sync); + my %rec= map { $_ => $row_sync->{$_} } @sync_fields; + $data{$rec{eprint_id}}= \%rec; + } + } + + # pass 2: get data from utheses.policies + my @utp_fields= qw(eprint_id ac_nummer urn doi lastmod); + { + my $col_utp= $db_ot2ut->get_collection('utheses.policies'); + my $cur_utp= $col_utp->find(); + $cur_utp->fields( { map { $_ => 1 } @utp_fields } ); + print __LINE__, " reading utp\n"; + while ($running) + { + my $row_utp= $cur_utp->next(); + last unless (defined ($row_utp)); + my $id= $row_utp->{eprint_id}; + if (exists ($data{$id})) + { + my $rec= $data{$id}; + foreach my $f (@utp_fields) { $rec->{$f}= $row_utp->{$f}; } + } + } + } + + # 3: get data from alma.marc + my @marc_fields= qw(ac_number mms_id fetched lib_code); + my @marc_extra_fields= qw(marc_record ts_fetched); + { + my $db_marc= get_marc_db($cnf); + my $col_marc= $db_marc->get_collection('alma.marc'); + print __LINE__, " checking alma.marc\n"; + my $num= 0; + MARC: foreach my $eprint_id (keys %data) + { + my $rec= $data{$eprint_id}; + my $ac_nummer= $rec->{ac_nummer}; + + if ((++$num % 1000) == 0) + { + print __LINE__, " num=[$num] eprint_id=[$eprint_id] ac_nummer=[$ac_nummer]\n" + } 
+ $rec->{marc_record}= 'no_ac_number'; + + next MARC unless (defined ($ac_nummer)); # no ac_number (or ac_nummer) no fun! + + my $marc= $col_marc->find_one({ ac_number => $ac_nummer }); + unless (defined ($marc)) + { + $rec->{marc_record}= 'not_found'; + next MARC; + } + + $rec->{marc_record}= 'found'; + $rec->{ts_fetched}= Util::ts::ts_ISO_gmt($marc->{fetched}); + foreach my $f (@marc_fields) { $rec->{$f}= $marc->{$f}; } + + my $mrd= $marc->{xmlref}->{records}->[0]->{record}->[0]->{recordData}->[0]->{record}->[0]; + unless (defined ($mrd)) + { + $rec->{marc_record}= 'marc_data_not_found'; + next MARC; + } + $rec->{marc_record}= 'marc_data_found'; + } + } + + # write tsv + { + my $tsv_fnm= sprintf ("sync_%s.tsv", $ot2ut_context); + print __LINE__, " writing migration table [$tsv_fnm]\n"; + + my @tsv_columns= (@sync_fields, @utp_fields, @marc_fields, @marc_extra_fields); # NOTE: eprint_id is there several times + open (TSV, '>:utf8', $tsv_fnm) or die; + print TSV join("\t", @tsv_columns), "\n"; + foreach my $eprint_id (sort { $a <=> $b } keys %data) + { + my $rec= $data{$eprint_id}; + print TSV join("\t", map { $rec->{$_} } @tsv_columns), "\n"; + } + close (TSV); + } + +} + sub cleanup_keywords { my $s= shift; @@ -5795,6 +5938,7 @@ sub bucket } # print __LINE__, " norm=[", join(':', @norm), "]\n"; + $p->{code}= join ('', map { substr($_,0,1); } @norm); $p; } @@ -5806,6 +5950,11 @@ sub show_tsv # TODO: rename ... my $fnm_counters= shift || 'counters.tsv'; # my $trailer= shift; + $db_ot2ut= IRMA::db::get_any_db($cnf, 'ot2ut_database') unless (defined ($db_ot2ut)); + my $col_bc= $db_ot2ut->get_collection('bucket.counters'); + my $col_bl= $db_ot2ut->get_collection('bucket.lists'); + $col_bl->remove({}); + my @columns= @{$self->{columns}}; my $column_count= @columns; @@ -5815,7 +5964,7 @@ sub show_tsv # TODO: rename ... 
enumerate(\@rows, $b, $column_count, []); my @counters= @$counters; - my @heading= ('bucket', @columns, @counters); + my @heading= ('bucket', 'code', @columns, @counters); my $idx_html= join('/', $self->{base_path}, 'buckets.html'); open (BUCKETS, '>:utf8', $idx_html) or die; @@ -5851,8 +6000,12 @@ EOX my ($vals, $bucket)= @$row; my @vals= @$vals; - print TSV join("\t", $bucket_nr, @vals, map { $bucket->{$_} } @counters), "\n"; - Util::JSON::write_json_file("bucket_${bucket_nr}.json", $bucket); + my $bucket_code= $bucket->{code}; + + print TSV join("\t", $bucket_nr, $bucket_code, @vals, map { $bucket->{$_} } @counters), "\n"; + Util::JSON::write_json_file("bucket_${bucket_code}.json", $bucket); + + $col_bl->update({ code => $bucket_code }, $bucket, { upsert => 1 }); my $annotation= $bucket->{annotation}; my $row_info; @@ -5866,7 +6019,8 @@ EOX } } - my $fnm_lst= sprintf("bucket_%d.html", $bucket_nr); + # my $fnm_lst= sprintf("bucket_%d.html", $bucket_nr); + my $fnm_lst= sprintf("bucket_%s.html", $bucket_code); my $path_lst= join ('/', $self->{base_path}, $fnm_lst); open (LST, '>:utf8', $path_lst) or die; print LST <<"EOX"; @@ -5879,6 +6033,7 @@ EOX </style> </head> <body> +<p>code: $bucket_code</p> <table> EOX @@ -5965,7 +6120,11 @@ EOX close (LST); print BUCKETS "<tr$row_info><td><a href=\"$fnm_lst\" target=\"bucket\">$bucket_nr</a></td>"; - foreach my $val (@$vals) { print BUCKETS "<td>$val</td>"; } + print BUCKETS "<td>", $bucket_code, "</td>"; + foreach my $val (@$vals) + { + print BUCKETS "<td>$val</td>"; + } ctr: foreach my $ctr (@counters) {