diff --git a/eprints1.pl b/eprints1.pl index 4f96c50b2dcd0ee08d3beefb80552209b73a9756..cb21900ffbcb172b6e1708ef96df6220fd9a9cad 100755 --- a/eprints1.pl +++ b/eprints1.pl @@ -74,6 +74,7 @@ $Data::Dumper::Sortkeys= 1; use FileHandle; +use utf8; binmode( STDIN, ':utf8' ); binmode( STDOUT, ':utf8' ); autoflush STDOUT 1; binmode( STDERR, ':utf8' ); autoflush STDERR 1; @@ -112,8 +113,8 @@ my $op_mode= 'unknown'; # ====================================================================== # BEGIN OT2UT: Othes to Utheses -# my $ot2ut_context= 'ot2ut-entw'; # TODO: parametrize -my $ot2ut_context= 'ot2ut-test'; # TODO: parametrize +my $ot2ut_context= 'ot2ut-entw'; # TODO: parametrize +# my $ot2ut_context= 'ot2ut-test'; # TODO: parametrize my $oma_sleep_time= 10; my %map_ot2ut_roles= @@ -172,6 +173,36 @@ if ($0 eq './oma.pl') { $op_mode= 'oma'; $do_upload= 1; } my $flag_add_utheses_policies= 1; my $flag_add_identifiers= 1; +my %bucketlist_column_descriptions= +( + bucket => 'Kategorie bzw. Klassifizierung - wird dynamisch erstellt - somit keine ID', + eprint_status => 'Status [Archiv/Buffer]', + einverstaendnis => 'Werknutzung [TRUE/FALSE/NULL=nicht vorhanden]', + sperre => 'Sperre nach §86 gesetzt [TRUE/FALSE/NULL=nicht vorhanden]', + hds => 'Has Date Sperre [future/past/no]', + fts => 'Full-Text-Status [public/restricted]', + docs => 'Verfügbare Dokumente [1 = Thesis]', + pub => 'Zahl der zugaenglichen Dokumente', + restr => 'Zahl der gesperrten Dokumente', + embargo => 'Zahl der Dokumente mit Embargo', + errors => 'z.B. Datensatz entspricht nicht den Erwartungen', + + # Abbildung bzw. Status in utheses + lrq => 'utheses policy flag: Lock-Request (wird ergaenzt, wenn Sperrdatum vorhanden: wird in Folge für den Import aus Hopla verwendet.)', + lst => 'utheses policy flag: Lock-Status nach §86', + kwl => 'utheses policy flag: Keywords-Locked', + absl => 'utheses policy flag: Abstract-Locked', + ftl => 'utheses policy flag: Full-Text-Locked', + urn => '[ID present/NULL]', + doi => '[ID present/NULL]', + + # Anzahl auf Instanz + othes => 'Zahl der Objekte in Eprints DB (othes)', + entw => 'Zahl der Objekte in utheses Entw', + test => 'Zahl der Objekte in utheses Test', + prod => 'Zahl der Objekte in utheses Prod', +); + my %doc_embargo_dates; # END OT2UT: Othes to Utheses # ====================================================================== @@ -2520,14 +2551,16 @@ sub oma { $ignore_errors= 1; $col_req->update({ _id => $row->{_id}}, { '$set' => { status => 'in_progress' }}); - my $cnt= @{$row->{ids}}; + my @ids; + foreach my $id (@{$row->{ids}}) { push (@ids, $id) if ($id =~ m#^\d+$#); } + my $cnt= @ids; my $msg= "send_ids: sending $cnt objects to $ot2ut_context"; activity({ activity => 'send_batch', msg => $msg}); send_message($msg); $silent_upload_success= 0; - my ($synced, $res)= ot2ut(@{$row->{ids}}); + my ($synced, $res)= ot2ut(@ids); send_message("send_ids: $res"); $new_status= 'done' if (@$synced); } @@ -2651,7 +2684,7 @@ sub ot2ut } } - my ($errors, $warnings, $row, $lastmod, $ut, $utheses_json_path, $files, $utheses_upload_result_json_path)= + my ($errors, $warnings, $row, $lastmod, $ut, $utheses_json_path, $files, $docs, $utheses_upload_result_json_path)= generate_utheses_metadata($epr, $eprint_id); my ($eprint_status)= map { $row->{$_} } qw(eprint_status); @@ -2665,7 +2698,7 @@ sub ot2ut if ($sync_info->{lastmod} eq $lastmod) { - print __LINE__, " already synced; skipping...\n"; + print __LINE__, " eprint_id=[$eprint_id] already synced; skipping...\n"; next; } else @@ -2680,7 +2713,10 @@ sub ot2ut my $ts_upload= ts_ISO_gmt(); - if (!defined ($files) || ref($files) ne 'ARRAY' || @$files != 1) + if (!defined ($files) || ref($files) ne 'ARRAY' || @$files < 1 + # || @$files != 1 # no attachments allowed yet + || @$files > 10 # testing ... + ) { push (@$errors, { error => 'num_files', note => 'currently limited to objects with exactly one file' } ); } @@ -2722,8 +2758,9 @@ sub ot2ut my $upload_cnf= $cnf->{$ot2ut_context}; die "no valid ot2ut context" unless (defined ($upload_cnf)); - my $main_file= $files->[0]; - my ($local_filename, $lfnm)= map { $main_file->{$_} } qw(path upl_fnm); + my @docs= @{$docs->{documents}}; + my $main_file= shift(@docs); + my ($local_filename, $lfnm)= map { $main_file->{$_} } qw(path_doc main); # TODO: use curl for now my @upload_cmd= (qw(/usr/bin/curl -X POST -v -H Content-Type:multipart/form-data -F), 'metadata=@' . $utheses_json_path, @@ -2739,6 +2776,7 @@ sub ot2ut } print __LINE__, " upload_cmd: [", join(' ', @upload_cmd), "]\n"; + my $utheses_id; if ($do_upload) { my $t_curl= time(); @@ -2804,11 +2842,13 @@ old format 2019-11..2020-01 =end comment =cut - my ($status, $import_status, $utheses_id, $response_msg, $alerts)= map { $result_data->{$_} } qw(status importStatus uthesesId responseMsg alerts); + my ($status, $import_status, $utheses_id1, $response_msg, $alerts)= map { $result_data->{$_} } qw(status importStatus uthesesId responseMsg alerts); + print __LINE__, " status=[$status] response_msg=[$response_msg]\n"; my $td_start= time()-$t_start; my $td_curl= time()-$t_curl; + my $upload_success; my $out_row= { eprint_id => $eprint_id, @@ -2817,20 +2857,134 @@ old format 2019-11..2020-01 context => $ot2ut_context, ts_upload => $ts_upload, td_total => $td_start, - error_code => 'ok', - error_cnt => 0, - utheses_id => $utheses_id, + error_code => 'unknown', + utheses_id => $utheses_id1, uploaded_fnm => $lfnm, + upload_status => $status, response_msg => $response_msg, }; + if (defined ($utheses_id1) && $status eq '200') + { + $utheses_id= $utheses_id1; + $out_row->{error_code}= $upload_success= 'ok'; + $out_row->{error_cnt}= 0; + } + else + { + $out_row->{error_code}= $upload_success= 'error'; + } + push (@synced, $out_row); $col_sync->insert($out_row); - send_message("upload success: eprint_id=[$eprint_id] eprint_status=[$eprint_status] lastmod=[$lastmod] context=[$ot2ut_context] utheses_id=[$utheses_id] time_total=$td_start time_upload=$td_curl") unless ($silent_upload_success); + send_message("upload $upload_success: eprint_id=[$eprint_id] eprint_status=[$eprint_status] lastmod=[$lastmod] context=[$ot2ut_context] utheses_id=[$utheses_id] time_total=$td_start time_upload=$td_curl") unless ($silent_upload_success && $upload_success eq 'ok'); } sleep(2); + + if (defined ($utheses_id)) + { + # process remaining documents as attachments + my $attachment_number= 0; + while (my $attachment= shift(@docs)) + { + $attachment_number++; + my $fnm_attachment_md= 'othes/utheses_json/' . $eprint_id . '_' . $attachment_number . '_md.json'; + my $fnm_attachment_res= 'othes/utheses_json/' . $eprint_id . '_' . $attachment_number . '_res.json'; + my $fnm_attachment_chg= 'othes/utheses_json/' . $eprint_id . '_' . $attachment_number . '_chg.json'; + + my $attachment_md= + { + attachment => my $ai= + { + file_name => $attachment->{main}, + file_mime_type => $attachment->{mime_type}, + lock_status => ($attachment->{security} eq 'public') ? 0 : 1, + upload_date => Util::ts::ts_gmdate(), + uploaded_by => 'ot2ut', + size => int($attachment->{stat_doc}->[7] / 1024) . ' KB', + }, + origin => "admin", + rights_statement => "http://rightsstatements.org/vocab/InC/1.0/", + data_model_type => "attachment", + }; + + $ai->{embargo_until_date}= join ('-', map { $attachment->{$_} } qw(date_embargo_year date_embargo_month date_embargo_day)) if (exists ($attachment->{date_embargo})); + $ai->{description}= $attachment->{formatdesc} if (defined ($attachment->{formatdate})); + + Util::JSON::write_json_file($fnm_attachment_md, $attachment_md); + +# curl -X POST http://localhost:3000/attachment/add/#uthesesId/fromOthes -F "metadata=@uthesesDM.json" -F "file=@attachment.jpg" + my $url1= join ('/', $upload_cnf->{api_url}, qw(attachment add), $utheses_id, 'fromOthes'); + my @attachment_add_cmd= (qw(/usr/bin/curl -X POST -v -H Content-Type:multipart/form-data -F), + 'metadata=@' . $fnm_attachment_md, '-F', 'file=@' . $attachment->{path_doc}, $url1, '-o' . $fnm_attachment_res); + + if (exists ($upload_cnf->{headers})) + { + foreach my $header (@{$upload_cnf->{headers}}) + { + push (@attachment_add_cmd, '--header', $header ); + } + } + + print __LINE__, " attachment_add_cmd: [", join(' ', @attachment_add_cmd), "]\n"; + if ($do_upload) + { + my $t_curl= time(); + system(@attachment_add_cmd); + + my $result_data1; + eval { $result_data1= Util::JSON::read_json_file($fnm_attachment_res); }; + if ($@) + { + print __LINE__, " can't parse upload_result; error=[$@]\n"; + } + else + { + print __LINE__, " upload attchment [$attachment_number] result: ", Dumper($result_data1); + my ($attachment_pid, $curl_status, $response_msg)= map { $result_data1->{$_ } } qw(attachmentPid status responseMsg); + + send_message("upload attachment success: eprint_id=[$eprint_id] context=[$ot2ut_context] utheses_id=[$utheses_id] attachment_number=[$attachment_number] attachment_pid=[$attachment_pid] curl_status=[$curl_status]"); # unless ($silent_upload_success); + +# curl -X POST http://localhost:3000/attachment/changeStatus/#attachmentPid -F "status=A" + my $url2= join ('/', $upload_cnf->{api_url}, qw(attachment changeStatus), $attachment_pid); + my @attachment_chg_cmd= (qw(/usr/bin/curl -X POST -v -H Content-Type:multipart/form-data -F status=A), + $url2, '-o' . $fnm_attachment_chg); + + if (exists ($upload_cnf->{headers})) + { + foreach my $header (@{$upload_cnf->{headers}}) + { + push (@attachment_chg_cmd, '--header', $header ); + } + } + + print __LINE__, " attachment_chg_cmd: [", join(' ', @attachment_chg_cmd), "]\n"; + my $t_curl= time(); + system(@attachment_chg_cmd); + + my $result_data2; + eval { $result_data2= Util::JSON::read_json_file($fnm_attachment_res); }; + if ($@) + { + print __LINE__, " can't parse upload_result; error=[$@]\n"; + } + else + { + print __LINE__, " change attchment [$attachment_number] result: ", Dumper($result_data2); + # my $attachment_pid= map { $result_data1->{$_ } } qw(attachmentPid status responseMsg); + } + + } + + } + } + } + else + { + # no utheses_id defined + } } } @@ -2885,19 +3039,33 @@ sub generate_utheses_metadata my $history= get_history($epr_db, $eprintid); # print __LINE__, " history: ", Dumper($history); exit; - my ($lang_pdf, $files)= analyze_files(map { $row->{$_} } qw(eprintid fileinfo dir)); + my ($eprint_status, $fileinfo, $dir)= map { $row->{$_} } qw(eprint_status fileinfo dir); + + my $docs= get_documents($epr_db, $eprintid, $dir); + print __LINE__, " docs: ", Dumper($docs); + + my ($lang_pdf, $files)= analyze_files($eprintid, $fileinfo, $dir); print __LINE__, " lang_pdf=[$lang_pdf] files: ", Dumper($files); - my $main_file; - if (@$files) - { - $main_file= $files->[0]; - } - else + + # merge info from get_documents() and analyze_files() + my @d= @{$docs->{documents}}; + push (@errors, { error => 'no_file' }) unless (@d); + my $main_file= $d[0]; # documents are ordered by their "pos" or "placement" attribute; the first on is the main document + my %docs= map { $_->{path_doc} => $_ } @d; + + foreach my $fileinfo (@$files) { - push (@errors, { error => 'no_file' }); + my $doc= $docs{$fileinfo->{path}}; + unless (defined ($doc)) + { + push (@errors, { error => 'document_mismatch', fileinfo => $fileinfo }); + next; + } + $doc->{fileinfo}= $fileinfo; } - my ($eprint_status)= map { $row->{$_} } qw(eprint_status); + print __LINE__, " docs: ", Dumper($docs); + print __LINE__, " main_file: ", Dumper($main_file); my $utheses_json_path= 'othes/utheses_json/' . $eprintid . '.json'; my $utheses_upload_result_json_path= 'othes/utheses_json/' . $eprintid . '_upload_result.json'; @@ -2920,8 +3088,9 @@ sub generate_utheses_metadata print __LINE__, " row: ", Dumper($row) if ($debug_level > 2); - my ($err_st_id, $study_id, $studies_codes)= get_study_id($row->{matr}, $row->{studienkennzahl}); - push (@errors, @$err_st_id) if (@$err_st_id); + my ($err_st_id, $warn_st_id, $study_id, $studies_codes)= get_study_id($row->{matr}, $row->{studienkennzahl}); + push (@errors, @$err_st_id) if (@$err_st_id); + push (@warnings, @$warn_st_id) if (@$warn_st_id); my $ut= new Univie::Utheses::Container(); @@ -3074,11 +3243,11 @@ sub generate_utheses_metadata # 2020-05-14 nd: not needed: $thesis->{uploaded_by}= 'ot2ut'; $thesis->{subject_classifications}= $classifications; - $thesis->{number_of_pages}= "$main_file->{page_count}"; # Phaidra expects this as a string + $thesis->{number_of_pages}= "$main_file->{fileinfo}->{page_count}"; # Phaidra or utheses expects this as a string # 2020-06-30: modify filename for files containing sensitive information, but keep the original filename - $thesis->{original_filename}= $main_file->{orig_fnm}; - $thesis->{upload_filename}= $main_file->{upl_fnm}; + $thesis->{original_filename}= $main_file->{fileinfo}->{orig_fnm}; + $thesis->{upload_filename}= $main_file->{fileinfo}->{upl_fnm}; $ut->public('thesis', $thesis); @@ -3091,23 +3260,25 @@ sub generate_utheses_metadata Util::JSON::write_json_file($utheses_json_path, $ut->{public}); - (\@errors, \@warnings, $row, $lastmod, $ut, $utheses_json_path, $files, $utheses_upload_result_json_path); + (\@errors, \@warnings, $row, $lastmod, $ut, $utheses_json_path, $files, $docs, $utheses_upload_result_json_path); } sub get_documents { my $epr_db= shift; my $eprint_id= shift; + my $dir= shift; my $document_rows= $epr_db->get_all_x('document', ['eprintid=?', $eprint_id]); # print __LINE__, " document_rows: ", Dumper($document_rows); - my @documents; + my @dirs= split('/', $dir); + shift(@dirs); # remove "disk0" from the beginning + my @notes; my $res= { eprint_id => $eprint_id, - documents => \@documents, cnt_docs => 0, cnt_public => 0, cnt_restricted => 0, @@ -3116,17 +3287,25 @@ sub get_documents notes => \@notes, }; + my @docs; DOCUMENT: foreach my $document_id (keys %$document_rows) { my $row= $document_rows->{$document_id}; + # print __LINE__, " document_row: ", Dumper($row); + my ($main, $idx)= map { $row->{$_} } qw(main pos); + if ($row->{main} eq 'indexcodes.txt' || $row->{main} eq 'preview.png') { # ignore these ... next DOCUMENT; } - # print __LINE__, " document_row: ", Dumper($row); - my $idx= $row->{pos}; + my $rev_dir= sprintf("%02d", $idx); + my $path_doc= join('/', '/backup/othes/eprints', @dirs, $rev_dir, $main); # TODO: parametrize! + my (@stat_doc)= stat($path_doc); + $row->{path_doc}= $path_doc; + $row->{stat_doc}= \@stat_doc; + if (defined ($row->{placement}) && $row->{pos} != $row->{placement}) { push (@notes, "pos != placement: pos=[$row->{pos}] placement=[$row->{placement}]"); @@ -3136,21 +3315,21 @@ sub get_documents $idx--; # NOTE: pos and placement start at 1 # print __LINE__, " idx=[$idx]\n"; - if (defined($documents[$idx])) + if (defined($docs[$idx])) { push (@notes, "already a document at index=[$idx]"); $res->{show}++; - push (@documents, $row); + push (@docs, $row); } elsif ($idx < 0) { push (@notes, "negative index=[$idx]"); $res->{show}++; - push (@documents, $row); + push (@docs, $row); } else { - $documents[$idx]= $row; + $docs[$idx]= $row; } $res->{cnt_docs}++; @@ -3178,12 +3357,15 @@ sub get_documents } } - my $idx_first_public; - foreach my $doc (@documents) + # document list may contain holes, that is, undefined items, so we remove these + my @docs2; + foreach my $doc (@docs) { - + push (@docs2, $doc) if (defined ($doc)); } + $res->{documents}= \@docs2, + $res; } @@ -3232,9 +3414,7 @@ sub get_study_id my $matr= shift; my $stkz= shift; - my @errors= (); - my @stkz; - my $studies_codes; + my (@errors, @warnings, @stkz, $studies_codes); # NOTES: # * Bildungseinrichtungs-ID (here university_code) see @@ -3255,9 +3435,9 @@ sub get_study_id while ($digits =~ s#^(\d\d\d)#{push(@stkz,$1);''}#ge){}; if ($digits ne '') { - if ($digits =~ m#^[23]$# && !defined ($coop_kz)) + if ($digits =~ m#^0?(2)$# && !defined ($coop_kz)) { - $coop_kz= $digits; + $coop_kz= $1; } else { @@ -3272,7 +3452,7 @@ sub get_study_id if (defined ($coop_kz)) { $studies_codes->{cooperation_code}= 'U'. $coop_kz if ($coop_kz =~ m#[A-Z]#); - $studies_codes->{cooperation_code}= '0'. $coop_kz if ($coop_kz =~ m#[23]#); + $studies_codes->{cooperation_code}= '0'. $coop_kz if ($coop_kz =~ m#^2$#); } } else @@ -3287,12 +3467,12 @@ sub get_study_id } else { - push (@errors, { error => 'stkz_empty' }); + push (@warnings, { warning => 'stkz_empty' }); } my $study_id= join ('', $matr, @stkz); - (\@errors, $study_id, $studies_codes); + (\@errors, \@warnings, $study_id, $studies_codes); } sub get_thesis_data @@ -3499,7 +3679,7 @@ sub get_names_for_role { my $names= $row->{$column_name}; print __LINE__, " names=[$names]\n"; - next if ($names eq ''); + next if ($names eq '' || $names eq '-' || $names eq 'k. A.' || $names =~ m#nicht angegeben *#); my @names= split (/\s*;\s*/, $names); foreach my $name (@names) @@ -3546,6 +3726,9 @@ sub analyze_files $format= 'pdf' if ($icon eq '/style/images/fileicons/pdf.png' || $icon eq '/style/images/fileicons/application_pdf.png'); $filepath =~ s#%([\dA-Fa-f]{2})#chr(hex($1))#ge; # filenames are URL encoded, see 19072 for an example + print __LINE__, " vor utf8::decode filepath=[$filepath]\n"; + utf8::decode($filepath); + print __LINE__, " nach utf8::decode filepath=[$filepath]\n"; my @filepath= split('/', $filepath); my $upl_fnm= my $fnm= pop(@filepath); @@ -3568,29 +3751,35 @@ sub analyze_files print __LINE__, " dir=[$dir] fi=[$fi] fnm=[$fnm] ext=[$ext]\n"; my $rev_dir= sprintf("%02d", pop(@filepath)); - my $path_pdf= join('/', '/backup/othes/eprints', @dirs, $rev_dir, $fnm); - my $path_txt= join('/', '/backup/othes/eprints', @dirs, $rev_dir, join ('.', @fnm, 'txt')); + my $path_pdf= join('/', '/backup/othes/eprints', @dirs, $rev_dir, $fnm); # TODO: parametrize! print __LINE__, " path_pdf=[$path_pdf]\n"; - print __LINE__, " path_txt=[$path_txt]\n"; - my @st_pdf= stat($path_pdf); - my @st_txt= stat($path_txt); - if (!@st_txt || $st_txt[9] < $st_pdf[9]) - { - system ('pdftotext', $path_pdf); - } + push (@files, my $f_obj= { format => $format, path => $path_pdf, orig_fnm => $fnm, upl_fnm => $upl_fnm,}); - my $verdict= `/home/gg/bin/guesslang.py '$path_txt'`; - print __LINE__, " guesslang: verdict=[$verdict]\n"; - my @pages= split("\n", $verdict); - foreach my $page (@pages) + if ($format eq 'pdf') { - my ($lang)= split(' ', $page); - # print __LINE__, " lang=[$lang]\n"; - $lang{$lang}++; - } + my $path_txt= join('/', '/backup/othes/eprints', @dirs, $rev_dir, join ('.', @fnm, 'txt')); + print __LINE__, " path_txt=[$path_txt]\n"; + + my @st_pdf= stat($path_pdf); + my @st_txt= stat($path_txt); + if (!@st_txt || $st_txt[9] < $st_pdf[9]) + { + system ('pdftotext', $path_pdf); + } - push (@files, { format => $format, path => $path_pdf, orig_fnm => $fnm, upl_fnm => $upl_fnm, page_count => scalar @pages }); + my $verdict= `/home/gg/bin/guesslang.py '$path_txt'`; + print __LINE__, " guesslang: verdict=[$verdict]\n"; + my @pages= split("\n", $verdict); + foreach my $page (@pages) + { + my ($lang)= split(' ', $page); + # print __LINE__, " lang=[$lang]\n"; + $lang{$lang}++; + } + + $f_obj->{page_count}= scalar @pages; + } } my $max= 0; @@ -3820,7 +4009,7 @@ sub debug_stkz my ($id, $stkz, $matr)= map { $r->{$_} } @col_names_db; push (@{$stkz{$stkz}}, $id); - my ($errors, $study_id, $studies_codes)= get_study_id($matr, $stkz); + my ($errors, $warnings, $study_id, $studies_codes)= get_study_id($matr, $stkz); my $stcd= join(':', map { $studies_codes->{$_} } qw(university_code code_1 code_2 code_3 cooperation_code)); # print __LINE__, " id=[$id] stcd=[$stcd] ", Dumper($studies_codes); @@ -3891,10 +4080,9 @@ sub update_policies print __LINE__, " eprint_id=[$eprint_id] x1_lastmod=[$x1_lastmod]\n"; print __LINE__, ' ', '='x70, "\n"; - my ($errors, $warnings, $row, $lastmod, $ut, $utheses_json_path, $files, $utheses_upload_result_json_path)= + my ($errors, $warnings, $row, $lastmod, $ut, $utheses_json_path, $files, $docs, $utheses_upload_result_json_path)= generate_utheses_metadata($epr, $eprint_id); - my $docs= get_documents($epr_db, $eprint_id); my $show= $docs->{show}; my $cnt_errors= @$errors; @@ -3980,6 +4168,7 @@ sub policies_stats my $cur_sync= $col_sync->find({}); my %synced; + my (@upload_errors, %upload_errors); while ($running) { my $row_sync= $cur_sync->next(); @@ -3987,7 +4176,12 @@ sub policies_stats # print __LINE__, " row_sync: ", Dumper($row_sync); last; my ($eprint_id, $lastmod, $context, $utheses_id)= map { $row_sync->{$_} } qw(eprint_id lastmod context utheses_id); - next unless (defined ($utheses_id)); + unless (defined ($utheses_id)) + { + # push (@upload_errors, $row_sync); + push (@{$upload_errors{$row_sync->{error_code}}}, $eprint_id); + next; + } if ($context eq 'ot2ut-test') { $context= 'test'; } elsif ($context eq 'ot2ut-entw') { $context= 'entw'; } @@ -4000,10 +4194,10 @@ sub policies_stats my $col_utp= $db_ot2ut->get_collection('utheses.policies'); my $cur_utp= $col_utp->find({}); - my @columns= qw(eprint_status einverstaendnis sperre hds fts docs pub restr errors); + my @columns= qw(eprint_status einverstaendnis sperre hds fts docs pub restr embargo errors); if ($flag_add_utheses_policies) { - push (@columns, qw(lrq kwl absl ftl)); + push (@columns, qw(lrq lst kwl absl ftl)); } if ($flag_add_identifiers) { @@ -4015,6 +4209,9 @@ sub policies_stats my $max; #= 1000; my @contexts= qw(entw test prod); my (%cnt_errors, %cnt_warnings, %all_errors); + my @lst_nonpublic_doc_first; # list of objects with cnt_public > 1 && cnt_restricted > 1 && first document is not public + my @lst_docs_with_notes; + my %docs_with_holes; while ($running) { my $row_utp= $cur_utp->next(); @@ -4025,8 +4222,26 @@ sub policies_stats map { $row_utp->{$_} } qw(eprint_id eprint_status date_sperre einverstaendnis lastmod full_text_status sperre cnt_errors cnt_warnings errors warnings docs ut_public urn doi ac_nummer); $eprint_id += 0; - my ($cnt_docs, $cnt_public, $cnt_restricted)= - map { $docs->{$_} } qw(cnt_docs cnt_public cnt_restricted); + my ($cnt_docs, $cnt_public, $cnt_restricted, $cnt_embargo)= + map { $docs->{$_} } qw(cnt_docs cnt_public cnt_restricted cnt_embargo); + + push (@lst_nonpublic_doc_first, $eprint_id) if ($cnt_public > 1 && $cnt_restricted > 1 && $docs->{documents}->[0]->{security} ne 'public'); + push (@lst_docs_with_notes, $eprint_id) if (@{$docs->{notes}} > 0); + + foreach my $doc (@{$docs->{documents}}) + { + unless (defined ($doc)) + { + $docs_with_holes{$eprint_id}++; + next; + } + + if (exists ($doc->{date_embargo})) + { + my $d= $doc->{date_embargo}; + if ($d eq '2099-12-31T00:00:00Z') { $doc_embargo_dates{$d}++; } else { push (@{$doc_embargo_dates{$d}}, $eprint_id); } + } + } my $has_errors= ($cnt_errors > 0) ? 'yes' : 'no'; my $has_date_sperre= 'no'; @@ -4036,9 +4251,9 @@ sub policies_stats $has_date_sperre=($date_sperre lt $ts_now) ? 'past' : 'future'; } - my @bucket_selectors= ($eprint_status, $einverstaendnis, $sperre, $has_date_sperre, $full_text_status, $cnt_docs, $cnt_public, $cnt_restricted, $has_errors); + my @bucket_selectors= ($eprint_status, $einverstaendnis, $sperre, $has_date_sperre, $full_text_status, $cnt_docs, $cnt_public, $cnt_restricted, $cnt_embargo, $has_errors); - my ($utptp, $lrq, $kwl, $absl, $ftl); + my ($utptp, $lrq, $lst, $kwl, $absl, $ftl); my $utpt= $utp->{thesis}; if (defined ($utpt)) { @@ -4047,12 +4262,12 @@ sub policies_stats if (defined ($utptp)) { - ($lrq, $kwl, $absl, $ftl)= map { $utptp->{$_} } qw(lock_request keywords_locked abstract_locked fulltext_locked); + ($lrq, $lst, $kwl, $absl, $ftl)= map { $utptp->{$_} } qw(lock_request lock_status keywords_locked abstract_locked fulltext_locked); } if ($flag_add_utheses_policies) { - push (@bucket_selectors, $lrq, $kwl, $absl, $ftl); + push (@bucket_selectors, $lrq, $lst, $kwl, $absl, $ftl); } if ($flag_add_identifiers) @@ -4072,9 +4287,10 @@ sub policies_stats foreach my $error (@$errors) { my $error_name= $error->{error}; + $error_name= 'bad_name_buffer' if ($error_name eq 'bad_name' && $eprint_status eq 'buffer'); $cnt_errors{$error_name}++; - $error->{eprint_id}= $eprint_id; + $error->{eprint_id}= "$eprint_id"; push (@{$all_errors{$error_name}}, $error); } } @@ -4108,9 +4324,9 @@ sub policies_stats } # print __LINE__, " working: cctab: ", Dumper($cctab); - # annotations - my @s1= qw(archive FALSE FALSE no restricted 1 1 0 no); - push (@s1, qw( 0 0 0 1)) if ($flag_add_utheses_policies); + # BEGIN annotations + my @s1= qw(archive FALSE FALSE no restricted 1 1 0 0 no); + push (@s1, qw(0 0 0 0 1)) if ($flag_add_utheses_policies); push (@s1, qw(present NULL)) if ($flag_add_identifiers); my $b1= $cctab->bucket(0, @s1); if (defined ($b1)) @@ -4119,8 +4335,8 @@ sub policies_stats print __LINE__, " special bucket found: ", Dumper($b1); } - my @s2= qw(archive FALSE FALSE no public 1 1 0 no); - push (@s2, qw(0 0 0 0)) if ($flag_add_utheses_policies); + my @s2= qw(archive FALSE FALSE no public 1 1 0 0 no); + push (@s2, qw(0 0 0 0 0)) if ($flag_add_utheses_policies); push (@s2, qw(present NULL)) if ($flag_add_identifiers); my $b2= $cctab->bucket(0, @s2); if (defined ($b2)) @@ -4129,8 +4345,8 @@ sub policies_stats print __LINE__, " special bucket found: ", Dumper($b2); } - my @s2b= qw(archive TRUE FALSE no public 1 1 0 no); - push (@s2b, qw(0 1 1 0)) if ($flag_add_utheses_policies); + my @s2b= qw(archive TRUE FALSE no public 1 1 0 0 no); + push (@s2b, qw(0 0 1 1 0)) if ($flag_add_utheses_policies); push (@s2b, qw(present NULL)) if ($flag_add_identifiers); my $b2b= $cctab->bucket(0, @s2b); if (defined ($b2b)) @@ -4139,8 +4355,8 @@ sub policies_stats print __LINE__, " special bucket found: ", Dumper($b2b); } - my @s2c= qw(archive TRUE NULL no public 1 1 0 no); - push (@s2c, qw(0 1 1 0)) if ($flag_add_utheses_policies); + my @s2c= qw(archive TRUE NULL no public 1 1 0 0 no); + push (@s2c, qw(0 0 1 1 0)) if ($flag_add_utheses_policies); push (@s2c, qw(present NULL)) if ($flag_add_identifiers); my $b2c= $cctab->bucket(0, @s2c); if (defined ($b2c)) @@ -4148,9 +4364,10 @@ sub policies_stats $b2c->{annotation}= { bgcolor => 'pink', note => 'abstract und keywords locked, aber pdf public' }; print __LINE__, " special bucket found: ", Dumper($b2c); } + # ZZZZ - my @s3= qw(archive TRUE FALSE no restricted 1 1 0 no); - push (@s3, qw(0 0 0 1)) if ($flag_add_utheses_policies); + my @s3= qw(archive TRUE FALSE no restricted 1 1 0 0 no); + push (@s3, qw(0 0 0 0 1)) if ($flag_add_utheses_policies); push (@s3, qw(present NULL)) if ($flag_add_identifiers); my $b3= $cctab->bucket(0, @s3); if (defined ($b3)) @@ -4159,8 +4376,8 @@ sub policies_stats print __LINE__, " special bucket found: ", Dumper($b3); } - my @s4a= qw(archive TRUE NULL no public 1 1 0 no); - push (@s4a, qw(0 0 0 0)) if ($flag_add_utheses_policies); + my @s4a= qw(archive TRUE NULL no public 1 1 0 0 no); + push (@s4a, qw(0 0 0 0 0)) if ($flag_add_utheses_policies); push (@s4a, qw(present present)) if ($flag_add_identifiers); my $b4a= $cctab->bucket(0, @s4a); if (defined ($b4a)) @@ -4169,8 +4386,8 @@ sub policies_stats print __LINE__, " special bucket found: ", Dumper($b4a); } - my @s4b= qw(archive TRUE NULL no public 1 1 0 yes); - push (@s4b, qw(0 0 0 0)) if ($flag_add_utheses_policies); + my @s4b= qw(archive TRUE NULL no public 1 1 0 0 yes); + push (@s4b, qw(0 0 0 0 0)) if ($flag_add_utheses_policies); push (@s4b, qw(NULL NULL)) if ($flag_add_identifiers); my $b4b= $cctab->bucket(0, @s4b); if (defined ($b4b)) @@ -4179,8 +4396,8 @@ sub policies_stats print __LINE__, " special bucket found: ", Dumper($b4b); } - my @s4c= qw(archive TRUE NULL no public 1 1 0 yes); - push (@s4c, qw(0 0 0 0)) if ($flag_add_utheses_policies); + my @s4c= qw(archive TRUE NULL no public 1 1 0 0 yes); + push (@s4c, qw(0 0 0 0 0)) if ($flag_add_utheses_policies); push (@s4c, qw(present present)) if ($flag_add_identifiers); my $b4c= $cctab->bucket(0, @s4c); if (defined ($b4c)) @@ -4189,8 +4406,8 @@ sub policies_stats print __LINE__, " special bucket found: ", Dumper($b4c); } - my @s5a= qw(archive TRUE FALSE no public 1 1 0 no); - push (@s5a, qw(0 0 0 0)) if ($flag_add_utheses_policies); + my @s5a= qw(archive TRUE FALSE no public 1 1 0 0 no); + push (@s5a, qw(0 0 0 0 0)) if ($flag_add_utheses_policies); push (@s5a, qw(present present)) if ($flag_add_identifiers); my $b5a= $cctab->bucket(0, @s5a); if (defined ($b5a)) @@ -4199,8 +4416,8 @@ sub policies_stats print __LINE__, " special bucket found: ", Dumper($b5a); } - my @s5b= qw(archive TRUE FALSE no public 1 1 0 no); - push (@s5b, qw(0 0 0 0)) if ($flag_add_utheses_policies); + my @s5b= qw(archive TRUE FALSE no public 1 1 0 0 no); + push (@s5b, qw(0 0 0 0 0)) if ($flag_add_utheses_policies); push (@s5b, qw(present NULL)) if ($flag_add_identifiers); my $b5b= $cctab->bucket(0, @s5b); if (defined ($b5b)) @@ -4209,8 +4426,8 @@ sub policies_stats print __LINE__, " special bucket found: ", Dumper($b5b); } - my @s5c= qw(archive TRUE FALSE no public 1 1 0 no); - push (@s5c, qw(0 0 0 0)) if ($flag_add_utheses_policies); + my @s5c= qw(archive TRUE FALSE no public 1 1 0 0 no); + push (@s5c, qw(0 0 0 0 0)) if ($flag_add_utheses_policies); push (@s5c, qw(NULL NULL)) if ($flag_add_identifiers); my $b5c= $cctab->bucket(0, @s5c); if (defined ($b5c)) @@ -4219,31 +4436,57 @@ sub policies_stats print __LINE__, " special bucket found: ", Dumper($b5c); } - my @s6a= qw(archive FALSE FALSE no restricted 1 0 1 no); - push (@s6a, qw(0 0 0 1)) if ($flag_add_utheses_policies); + my @s6a= qw(archive FALSE FALSE no restricted 1 0 1 1 no); + push (@s6a, qw(0 0 0 0 1)) if ($flag_add_utheses_policies); push (@s6a, qw(present NULL)) if ($flag_add_identifiers); my $b6a= $cctab->bucket(0, @s6a); if (defined ($b6a)) { - $b6a->{annotation}= { bgcolor => 'lightblue', note => 'kein Volltext' }; + $b6a->{annotation}= { bgcolor => 'lightblue', note => 'kein Volltext; NBN schon vergeben' }; print __LINE__, " special bucket found: ", Dumper($b6a); } - my @s6b= qw(archive FALSE FALSE no restricted 1 0 1 no); - push (@s6b, qw(0 0 0 1)) if ($flag_add_utheses_policies); + my @s6a2= qw(archive FALSE FALSE no restricted 1 0 1 1 no); + push (@s6a2, qw(0 0 0 0 1)) if ($flag_add_utheses_policies); + push (@s6a2, qw(NULL NULL)) if ($flag_add_identifiers); + my $b6a2= $cctab->bucket(0, @s6a2); + if (defined ($b6a)) + { + $b6a2->{annotation}= { bgcolor => 'lightblue', note => 'kein Volltext; NBN noch nicht vergeben' }; + print __LINE__, " special bucket found: ", Dumper($b6a2); + } + + my @s6b= qw(archive FALSE FALSE no restricted 1 0 1 1 no); + push (@s6b, qw(0 0 0 0 1)) if ($flag_add_utheses_policies); push (@s6b, qw(present present)) if ($flag_add_identifiers); my $b6b= $cctab->bucket(0, @s6b); if (defined ($b6b)) { - $b6b->{annotation}= { bgcolor => ' #33a2ff', note => 'kein Volltext; DOI wurde offenbar vorher vergeben' }; + $b6b->{annotation}= { bgcolor => '#33a2ff', note => 'Volltext gesperrt; DOI wurde offenbar vorher vergeben' }; + print __LINE__, " special bucket found: ", Dumper($b6b); + } + + my @s6c= qw(archive FALSE FALSE no restricted 1 0 1 0 no); + push (@s6c, qw(0 0 0 0 1)) if ($flag_add_utheses_policies); + push (@s6c, qw(present NULL)) if ($flag_add_identifiers); + my $b6c= $cctab->bucket(0, @s6c); + if (defined ($b6c)) + { + $b6c->{annotation}= { bgcolor => 'pink', note => 'Volltext restrected, kein Embargo' }; print __LINE__, " special bucket found: ", Dumper($b6b); } + # END annotations # print __LINE__, " cctab: ", Dumper($cctab); my $trailer= "<h2>errors</h2>\n". Dumper(\%cnt_errors); $trailer .= "<h2>warnings</h2>\n". Dumper(\%cnt_warnings); print __LINE__, " trailer=[",$trailer,"]\n"; + $trailer .= "<h2>nonpublic_doc_first</h2>\n". Dumper(\@lst_nonpublic_doc_first) if (@lst_nonpublic_doc_first); + $trailer .= "<h2>docs with notes</h2>\n". Dumper(\@lst_docs_with_notes) if (@lst_docs_with_notes); + $trailer .= "<h2>embargo dates</h2>\n". Dumper(\%doc_embargo_dates); + $trailer .= "<h2>upload_errors</h2>\n". Dumper(\%upload_errors); + $cctab->show_tsv(['othes', @contexts], 'counters.tsv', $trailer); # show objects which were uploaded to utheses but are no longer present in othes @@ -4254,6 +4497,8 @@ sub policies_stats } Util::JSON::write_json_file('all_errors.json', \%all_errors); + # Util::JSON::write_json_file('upload_errors.json', \@upload_errors); + Util::JSON::write_json_file('docs_with_holes.json', \%docs_with_holes); } sub cleanup_keywords @@ -4611,7 +4856,11 @@ EOX my $path_lst= join ('/', $base_path, $fnm_lst); open (LST, '>:utf8', $path_lst) or die; print LST <<"EOX"; +<head> +<meta charset="UTF-8"> <title>bucket $bucket_nr</title> +</head> +<body> <table> EOX @@ -4619,7 +4868,8 @@ EOX foreach my $col (@columns) { my $val= shift (@vals); - print LST "<tr><th>$col</th><td>$val</td></tr>\n"; + my $desc= $bucketlist_column_descriptions{$col} || + print LST "<tr><th>$col</th><td>$val</td><td>$desc</td></tr>\n"; } print LST "</table>\n"; @@ -4713,9 +4963,12 @@ EOX if (defined($trailer)) { print IDX $trailer; - print __LINE__, " trailer=[",$trailer,"]\n"; + # print __LINE__, " trailer=[",$trailer,"]\n"; } + print IDX <<EOX; +</body> +EOX close (IDX); close (TSV); } @@ -4897,4 +5150,3 @@ Notes: * & in abstracts is encoded as & in the DataCite.XML file, but DataCite itself seems to strip that out again * as of 2020-01-20, title is processed with strip_text() -