diff --git a/eprints1.pl b/eprints1.pl
index 70d3705df8611ce8f99fb319c7721b6e2f389292..b03748d318e235a7570823e77c301c68160a7cea 100755
--- a/eprints1.pl
+++ b/eprints1.pl
@@ -169,7 +169,8 @@ my $ignore_errors= 0;
 
 if ($0 eq './ot2ut.pl') { $op_mode= 'ot2ut'; $do_upload= 1; $MAX_SYNC= 1; }
 if ($0 eq './oma.pl') { $op_mode= 'oma'; $do_upload= 1; }
-# END OT2UT: Othesis to Utheses
+my %doc_embargo_dates;
+# END OT2UT: Othes to Utheses
 
 # ======================================================================
 my @db_tables= ();
@@ -219,6 +220,8 @@ while (defined ($arg= shift (@ARGV)))
   elsif ($opt eq 'debug_stkz') { $op_mode= 'debug_stkz'; }
   elsif ($opt eq 'debug_stbez') { $op_mode= 'debug_stbez'; }
   elsif ($opt eq 'debug_filenames') { $op_mode= 'debug_filenames'; }
+  elsif ($opt eq 'update-policies') { $op_mode= 'update-policies'; }
+  elsif ($opt eq 'policies-stats') { $op_mode= 'policies-stats'; }
   elsif ($opt eq 'max') { $MAX_SYNC= $val || shift (@ARGV); }
   elsif ($opt eq 'mab-age') { $MAX_MAB_AGE= $val || shift (@ARGV); } # in seconds
   elsif ($opt eq 'marc-age'){ $MAX_MARC_AGE= $val || shift (@ARGV); } # in seconds
@@ -391,6 +394,10 @@ elsif ($op_mode eq 'debug_filenames')
 {
   debug_filenames();
 }
+elsif ($op_mode eq 'policies-stats')
+{
+  policies_stats();
+}
 elsif ($op_mode eq 'reset') # reset error conditions for given ac_numbers
 {
   reset_errors(@PARS);
@@ -2329,7 +2336,11 @@ sub get_othes_timestamp
   foreach my $el (qw(year month day))
   {
     my $x= $row->{$name . '_' . $el};
-    return undef unless (defined ($x));
+    unless (defined ($x))
+    {
+      return undef if ($el eq 'year');
+      $x= 1; # undef month or day becomes 1
+    }
     push (@ts, $x);
   }
 
@@ -2475,10 +2486,11 @@ sub oma
 
   my $new_status= 'failed';
   if ($row->{action} eq 'send_batch')
-  {
+  { # allow batch sizes up to 1000; be silent for batch sizes bigger than 10 items
     my $bs= $row->{batch_size};
-    $bs= 10 unless ($bs > 0 && $bs <= 100);
+    $bs= 10 unless ($bs > 0 && $bs <= 1000);
     $MAX_SYNC= $bs;
+    $silent_upload_success= ($bs > 10) ? 1 : 0;
 
     my $eprint_status= $row->{eprint_status};
 
@@ -2506,6 +2518,7 @@ sub oma
     my $msg= "send_ids: sending $cnt objects to $ot2ut_context";
     activity({ activity => 'send_batch', msg => $msg});
     send_message($msg);
+    $silent_upload_success= 0;
 
     my ($synced, $res)= ot2ut(@{$row->{ids}});
     send_message("send_ids: $res");
@@ -2577,6 +2590,7 @@ sub ot2ut
     # print __LINE__, " utheses_faculty_map: ", Dumper($utheses_faculty_map); exit;
   }
 
+  # find items to upload
   unless (@eprint_ids)
   {
     print __LINE__, " fetching data\n";
@@ -2584,7 +2598,7 @@ sub ot2ut
 
     if ($ot2ut_eprint_status eq 'archive')
     {
-      $res1= $epr->fetch_data('archive');
+      $res1= $epr->fetch_data('archive', { doi => 0 });
     }
     elsif ($ot2ut_eprint_status eq 'buffer')
     {
@@ -2613,7 +2627,7 @@ sub ot2ut
     last unless ($running);
     last if (defined ($MAX_SYNC) && $cnt_synced >= $MAX_SYNC);
 
-    activity({ activity => 'ot2ut'}) if ($last_activity + $activity_period <= time());
+    activity({ activity => 'ot2ut', eprint_id => $eprint_id }) if ($last_activity + $activity_period <= time());
 
     my $t_start= time();
 
@@ -3073,6 +3087,99 @@ sub generate_utheses_metadata
   (\@errors, \@warnings, $row, $lastmod, $ut, $utheses_json_path, $files, $utheses_upload_result_json_path);
 }
 
+sub get_documents
+{
+  my $epr_db= shift;
+  my $eprint_id= shift;
+
+  my $document_rows= $epr_db->get_all_x('document', ['eprintid=?', $eprint_id]);
+  # print __LINE__, " document_rows: ", Dumper($document_rows);
+
+  my @documents;
+  my @notes;
+  my $res=
+  {
+    eprint_id => $eprint_id,
+    documents => \@documents,
+    cnt_docs => 0,
+    cnt_public => 0,
+    cnt_restricted => 0,
+    cnt_embargo => 0,
+    show => 0, # internal flag
+    notes => \@notes,
+  };
+
+  DOCUMENT: foreach my $document_id (keys %$document_rows)
+  {
+    my $row= $document_rows->{$document_id};
+
+    if ($row->{main} eq 'indexcodes.txt' || $row->{main} eq 'preview.png')
+    { # ignore these ...
+      next DOCUMENT;
+    }
+
+    # print __LINE__, " document_row: ", Dumper($row);
+    my $idx= $row->{pos};
+    if (defined ($row->{placement}) && $row->{pos} != $row->{placement})
+    {
+      push (@notes, "pos != placement: pos=[$row->{pos}] placement=[$row->{placement}]");
+      $res->{show}++;
+      $idx= $row->{placement};
+    }
+
+    $idx--; # NOTE: pos and placement start at 1
+    # print __LINE__, " idx=[$idx]\n";
+    if (defined($documents[$idx]))
+    {
+      push (@notes, "already a document at index=[$idx]");
+      $res->{show}++;
+      push (@documents, $row);
+    }
+    elsif ($idx < 0)
+    {
+      push (@notes, "negative index=[$idx]");
+      $res->{show}++;
+      push (@documents, $row);
+    }
+    else
+    {
+      $documents[$idx]= $row;
+    }
+
+    $res->{cnt_docs}++;
+    if ($row->{security} eq 'public')
+    {
+      $res->{cnt_public}++;
+    }
+    else
+    {
+      $res->{cnt_restricted}++;
+    }
+
+    if (defined ($row->{date_embargo_year}))
+    {
+      $res->{cnt_embargo}++;
+      $row->{date_embargo}= my $d= get_othes_timestamp($row, 'date_embargo');
+
+      if ($d eq '')
+      {
+        push (@notes, "embargo date empty");
+        $res->{show}++;
+      }
+
+      $doc_embargo_dates{$d}++;
+    }
+  }
+
+  my $idx_first_public;
+  foreach my $doc (@documents)
+  {
+
+  }
+
+  $res;
+}
+
 sub get_history
 {
   my $epr_db= shift;
@@ -3361,7 +3468,7 @@ mysql> select count(*), abstract_nicht_anzeigen from eprint group by abstract_ni
     $thesis{assessment_date}= '';
   }
 
-print __LINE__, " thesis: ", Dumper (\%thesis);
+  # print __LINE__, " thesis: ", Dumper (\%thesis);
 
   (\@errors, \@warnings, \%thesis);
 }
@@ -3375,7 +3482,7 @@ sub get_names_for_role
   my $row= shift;
   my $column_names= shift;
 
-  print __LINE__, " column_names: ", Dumper($column_names);
+  # print __LINE__, " column_names: ", Dumper($column_names);
 
   my @errors= ();
   my @warnings= ();
@@ -3456,7 +3563,8 @@ sub analyze_files
   my $rev_dir= sprintf("%02d", pop(@filepath));
   my $path_pdf= join('/', '/backup/othes/eprints', @dirs, $rev_dir, $fnm);
   my $path_txt= join('/', '/backup/othes/eprints', @dirs, $rev_dir, join ('.', @fnm, 'txt'));
-  print __LINE__, " path_pdf=[$path_pdf] path_txt=[$path_txt]\n";
+  print __LINE__, " path_pdf=[$path_pdf]\n";
+  print __LINE__, " path_txt=[$path_txt]\n";
 
   my @st_pdf= stat($path_pdf);
   my @st_txt= stat($path_txt);
@@ -3731,7 +3839,7 @@ sub debug_stkz
   }
 }
 
-sub debug_filenames
+sub update_policies
 {
   my $epr= get_eprints_db($cnf);
   # print "epr: ", Dumper ($epr);
@@ -3739,15 +3847,201 @@ sub debug_filenames
   my $epr_db= $epr->connect();
   # print "epr_db: ", Dumper ($epr_db);
 
-  my @col_names= qw( eprintid fileinfo );
-  $epr_db->show_query(1);
+  $db_ot2ut= IRMA::db::get_any_db($cnf, 'ot2ut_database') unless (defined ($db_ot2ut));
+  my $col_utp= $db_ot2ut->get_collection('utheses.policies');
+
+  my @col_names= qw( eprintid lastmod_year lastmod_month lastmod_day lastmod_hour lastmod_minute lastmod_second );
+  # $epr_db->show_query(1);
 
   my $search_term= "eprint_status IN ('archive', 'buffer')";
   my $keys= $epr_db->get_all_x('eprint', [$search_term], join(',', @col_names));
 
-  foreach my $key (keys %$keys)
+  my $ts_start= Util::ts::ts_ISO_gmt(time());
+
+  my ($cnt_updated, $cnt_inserted, $cnt_unchanged)= (0, 0, 0);
+  my (@lst_updated, @lst_inserted, @lst_unchanged);
+
+  foreach my $eprint_id (keys %$keys)
   {
-    my $r= $keys->{$key};
-    print __LINE__, " key=[$key] ", Dumper($r);
+    last unless ($running);
+
+    my $x1= $keys->{$eprint_id};
+    my $x1_lastmod= sprintf("%4d-%02d-%02dT%02d%02d%02d", map { $x1->{$_} } qw(lastmod_year lastmod_month lastmod_day lastmod_hour lastmod_minute lastmod_second));
+    # print __LINE__, " x1: ", Dumper($x1);
+
+    my $utp_info= $col_utp->find_one({ eprint_id => $eprint_id });
+
+    if (defined ($utp_info) && $utp_info->{lastmod} eq $x1_lastmod)
+    {
+      print __LINE__, " NOTE: already processed, no change: x1_lastmod=[$x1_lastmod] eprint_id=[$eprint_id]\n";
+      $cnt_unchanged++;
+      push (@lst_unchanged, $eprint_id);
+      next;
+    }
+    print __LINE__, " eprint_id=[$eprint_id] x1_lastmod=[$x1_lastmod]\n";
+
+    print __LINE__, ' ', '='x70, "\n";
+    my ($errors, $warnings, $row, $lastmod, $ut, $utheses_json_path, $files, $utheses_upload_result_json_path)=
+      generate_utheses_metadata($epr, $eprint_id);
+
+    my $docs= get_documents($epr_db, $eprint_id);
+    my $show= $docs->{show};
+
+    my $cnt_errors= @$errors;
+    my $cnt_warnings= @$warnings;
+
+    my $full_data=
+    {
+      eprint_id => $eprint_id,
+      lastmod => $lastmod,
+      date_sperre => get_othes_timestamp($row, 'date_sperre'),
+      generated => Util::ts::ts_ISO_gmt(time()),
+
+      ut_public => $ut->{public},
+      docs => $docs,
+      files => $files,
+
+      cnt_errors => $cnt_errors, errors => $errors,
+      cnt_warnings => $cnt_warnings, warnings => $warnings,
+
+      utheses_json_path => $utheses_json_path,
+      utheses_upload_result_json_path => $utheses_upload_result_json_path,
+    };
+
+    map { $full_data->{$_}= $row->{$_} } qw(eprint_status einverstaendnis sperre full_text_status);
+
+    # $show++ if ($docs->{cnt_embargo} > 0);
+    # $show++ if ($docs->{cnt_public} > 0 && $docs->{cnt_restricted} > 0);
+
+    if ($show)
+    {
+      # print __LINE__, " documents: eprint_id=[$eprint_id] show=[$show] ", Dumper($docs);
+      print __LINE__, " full_data: eprint_id=[$eprint_id] show=[$show] ", Dumper($full_data);
+    }
+
+    if (defined ($utp_info))
+    {
+      my $rc_upd= $col_utp->update( { _id => $utp_info->{_id} }, $full_data);
+      $cnt_updated++;
+      push (@lst_updated, $eprint_id);
+
+      print __LINE__, " rc_upd: ", Dumper($rc_upd);
+    }
+    else
+    {
+      my $rc_ins= $col_utp->insert( $full_data);
+      $cnt_inserted++;
+      push (@lst_inserted, $eprint_id);
+
+      print __LINE__, " rc_ins: ", Dumper($rc_ins);
+    }
+    print __LINE__, ' ', '='x70, "\n";
+  }
+
+  my %stats=
+  (
+    agent => 'update_policies',
+    ts_start => $ts_start,
+    ts_finish => Util::ts::ts_ISO_gmt(time()),
+
+    cnt_inserted => $cnt_inserted,
+    cnt_updates => $cnt_updated,
+    cnt_unchanged => $cnt_unchanged,
+    lst_inserted => \@lst_inserted,
+    lst_updated => \@lst_updated,
+    lst_unchanged => \@lst_unchanged,
+  );
+
+  my $col_stats= $db_ot2ut->get_collection('statistics');
+  $col_stats->insert( \%stats );
+
+  $stats{lst_unchanged}= '<deleted>';
+  print __LINE__, " stats: ", Dumper(\%stats);
+
+  print __LINE__, " embargo dates: ", Dumper(\%doc_embargo_dates);
+}
+
+sub policies_stats
+{
+  $db_ot2ut= IRMA::db::get_any_db($cnf, 'ot2ut_database') unless (defined ($db_ot2ut));
+
+  # prepare: get info from sync database
+  my $col_sync= $db_ot2ut->get_collection('sync');
+  my $cur_sync= $col_sync->find({});
+
+  my %synced;
+  while ($running)
+  {
+    my $row_sync= $cur_sync->next();
+    last unless (defined ($row_sync));
+
+    # print __LINE__, " row_sync: ", Dumper($row_sync); last;
+    my ($eprint_id, $lastmod, $context, $utheses_id)= map { $row_sync->{$_} } qw(eprint_id lastmod context utheses_id);
+    next unless (defined ($utheses_id));
+
+    if ($context eq 'ot2ut-test') { $context= 'test'; }
+    elsif ($context eq 'ot2ut-entw') { $context= 'entw'; }
+    elsif ($context eq 'ot2ut-prod') { $context= 'prod'; }
+
+    $synced{$eprint_id}->{$context}= $lastmod;
+  }
+
+  # MAIN PART: analyze othes policies collection
+  my $col_utp= $db_ot2ut->get_collection('utheses.policies');
+  my $cur_utp= $col_utp->find({});
+
+  my @columns= qw(eprint_status einverstaendnis sperre hds fts docs pub restr errors);
+  my $cctab= new cctab(columns => \@columns);
+
+  my $max; #= 1000;
+  my @contexts= qw(entw test prod);
+  while ($running)
+  {
+    my $row_utp= $cur_utp->next();
+    last unless (defined ($row_utp));
+    # print __LINE__, " row_utp: ", Dumper($row_utp); last;
+
+    my ($eprint_id, $eprint_status, $date_sperre, $einverstaendnis, $lastmod, $full_text_status, $sperre, $cnt_errors, $cnt_warnings, $docs)=
+      map { $row_utp->{$_} } qw(eprint_id eprint_status date_sperre einverstaendnis lastmod full_text_status sperre cnt_errors cnt_warnings docs);
+
+    my ($cnt_docs, $cnt_public, $cnt_restricted)=
+      map { $docs->{$_} } qw(cnt_docs cnt_public cnt_restricted);
+
+    my $has_errors= ($cnt_errors > 0) ? 'yes' : 'no';
+    my $has_date_sperre= 'no';
+    if (defined ($date_sperre))
+    {
+      my $ts_now= Util::ts::ts_ISO3_gmt(time());
+      $has_date_sperre= ($date_sperre lt $ts_now) ? 'past' : 'future';
+    }
+
+    my $bucket= $cctab->bucket($eprint_status, $einverstaendnis, $sperre, $has_date_sperre, $full_text_status, $cnt_docs, $cnt_public, $cnt_restricted, $has_errors);
+
+    $bucket->{othes}++;
+    push (@{$bucket->{othes_ids}} => $eprint_id);
+
+    my $found_utheses= 0;
+    foreach my $context (@contexts)
+    {
+      if (exists ($synced{$eprint_id}->{$context}))
+      {
+        $bucket->{$context}++;
+        push (@{$bucket->{$context . '_ids'}} => $eprint_id);
+        $found_utheses++;
+      }
+    }
+    delete($synced{$eprint_id}) if (exists ($synced{$eprint_id}));
+
+    last if (defined($max) && --$max <= 0);
+  }
+
+  # print __LINE__, " cctab: ", Dumper($cctab);
+  $cctab->show_tsv(['othes', @contexts]);
+
+  # show objects which were uploaded to utheses but are no longer present in othes
+  my @synced_not_found= sort { $a <=> $b } keys %synced;
+  if (@synced_not_found)
+  {
+    print __LINE__, " ATTN: ", scalar @synced_not_found, " objects synced to utheses but not present at othes:\n", Dumper (\%synced);
   }
 }
 
@@ -3829,7 +4123,6 @@ sub cleanup_keywords
 
   (\@notes, \@res);
 }
-
 package IRMA::ac_number;
 
 =head2 check_ac_number ($irma_db, $ac_number, $context, $context_id, $context_url)
@@ -3975,6 +4268,121 @@ sub get_ac_errors
 
   $irma_db->get_all_x ('ac_numbers', $query);
 }
 
+package cctab;
+
+sub new
+{
+  my $class= shift;
+  my %par= @_;
+
+  my $self=
+  {
+    buckets => {},
+    values => [],
+  };
+  bless $self, $class;
+
+  foreach my $par (keys %par)
+  {
+    $self->{$par}= $par{$par};
+  }
+
+  $self;
+}
+
+sub normalize
+{
+  my $val= shift;
+
+  if (!defined($val)) { $val= 'NULL'; }
+  elsif ($val eq '') { $val= 'empty'; }
+  elsif ($val =~ m#^\d+$# && $val > 1) { $val= '2+'; }
+
+  $val;
+}
+
+sub bucket
+{
+  my $self= shift;
+  my @pars= @_;
+
+  my $p= $self->{buckets};
+  my $v= $self->{values};
+
+  my @norm;
+  for (my $i= 0; $i <= $#pars; $i++)
+  {
+    my $par= $pars[$i];
+    my $norm= normalize($par);
+    push (@norm, $norm);
+    # print __LINE__, " par=[$par] norm=[$norm]\n";
+    $p->{$norm}= {} unless (exists($p->{$norm}));
+    $p= $p->{$norm};
+
+    $v->[$i]->{$norm}++;
+  }
+
+  # print __LINE__, " norm=[", join(':', @norm), "]\n";
+
+  $p;
+}
+
+sub show_tsv
+{
+  my $self= shift;
+  my $counters= shift;
+  my $fnm_counters= shift || 'counters.tsv';
+
+  my @columns= @{$self->{columns}};
+  my $column_count= @columns;
+
+  my @counters= @$counters;
+  my @heading= ('bucket', @columns, @counters);
+
+  open (TSV, '>:utf8', $fnm_counters) or die;
+  print __LINE__, " saving bucket_counters to '$fnm_counters'\n";
+  print TSV join("\t", @heading), "\n";
+
+  my $b= $self->{buckets};
+  my @rows= ();
+
+  enumerate(\@rows, $b, $column_count, []);
+
+  # print __LINE__, " rows: ", main::Dumper(\@rows);
+  my $bucket_nr= 0;
+  foreach my $row (@rows)
+  {
+    $bucket_nr++;
+    my ($vals, $bucket)= @$row;
+    print TSV join("\t", $bucket_nr, @$vals, map { $bucket->{$_} } @counters), "\n";
+
+    Util::JSON::write_json_file("bucket_${bucket_nr}.json", $bucket);
+  }
+}
+
+sub enumerate
+{
+  my $r= shift;
+  my $b= shift;
+  my $c= shift;
+  my $v= shift;
+
+  foreach my $key (sort keys %$b)
+  {
+    my @v= @$v;
+    push (@v, $key);
+
+    if ($c == 1)
+    {
+      push(@$r, [\@v, $b->{$key}]);
+    }
+    else
+    {
+      enumerate ($r, $b->{$key}, $c-1, \@v);
+    }
+  }
+}
+
 __END__
 
 =head1 TODO