Skip to content
Snippets Groups Projects
Commit 0b8aeced authored by Gerhard Gonter's avatar Gerhard Gonter :speech_balloon:
Browse files

modifications for utheses uploads

parent e50bdf16
No related branches found
No related tags found
No related merge requests found
......@@ -115,8 +115,8 @@ my $ot2ut_context= 'ot2ut-entw'; # TODO: parametrize
my %map_ot2ut_roles=
(
'advisors' => [qw(betreuer betreuer_2 betreuer_3)],
'coadvisors' => [qw(mitbetreuer mitbetreuer_2)],
'advisers' => [qw(betreuer betreuer_2 betreuer_3)],
'coadvisers' => [qw(mitbetreuer mitbetreuer_2)],
'assessors' => [qw(beurteiler_1 beurteiler_2 beurteiler_3)],
);
......@@ -203,6 +203,7 @@ while (defined ($arg= shift (@ARGV)))
elsif ($opt eq 'debug_names') { $op_mode= 'debug_names'; }
elsif ($opt eq 'debug_classifications' || $opt eq 'DC') { $op_mode= 'debug_classifications'; }
elsif ($opt eq 'debug_keywords') { $op_mode= 'debug_keywords'; }
elsif ($opt eq 'debug_abstracts') { $op_mode= 'debug_abstracts'; }
elsif ($opt eq 'debug_stkz') { $op_mode= 'debug_stkz'; }
elsif ($opt eq 'max') { $MAX_SYNC= $val || shift (@ARGV); }
elsif ($opt eq 'mab-age') { $MAX_MAB_AGE= $val || shift (@ARGV); } # in seconds
......@@ -341,6 +342,10 @@ elsif ($op_mode eq 'debug_keywords')
{
debug_keywords();
}
elsif ($op_mode eq 'debug_abstracts')
{
debug_abstracts();
}
elsif ($op_mode eq 'debug_classifications')
{
# print "cnf: ", Dumper ($cnf);
......@@ -2453,6 +2458,7 @@ sub ot2ut
my $ot2ut= get_any_db($cnf, 'ot2ut_database');
my $col_sync= $ot2ut->get_collection('sync');
my $col_msg= $ot2ut->get_collection('messages');
unless (defined ($utheses_faculty_map))
{
......@@ -2484,6 +2490,8 @@ sub ot2ut
{
last if (defined ($MAX_SYNC) && $cnt_synced >= $MAX_SYNC);
my $t_start= time();
my $sync_info= $col_sync->find_one({eprint_id => $eprint_id});
my ($errors, $row, $lastmod, $ut, $utheses_json_path, $files, $utheses_upload_result_json_path)= generate_utheses_metadata($epr, $eprint_id);
......@@ -2511,7 +2519,7 @@ sub ot2ut
}
else
{
print __LINE__, " earlier sync attempt had errors, retgrying...\n";
print __LINE__, " earlier sync attempt had errors, retrying...\n";
$col_sync->remove( { _id => $sync_info->{_id} } );
$sync_info= undef;
}
......@@ -2529,11 +2537,20 @@ sub ot2ut
print __LINE__, " ERRORS; ut: ", Dumper($ut);
print __LINE__, " generate_utheses_metadata: errors: ", Dumper($errors) if (@$errors);
my $el= { eprint_id => $eprint_id, lastmod => $lastmod, ts_upload => $ts_upload, error_code => 'conversion_errors', error_cnt => scalar @$errors };
my $el= { eprint_id => $eprint_id, lastmod => $lastmod, ts_upload => $ts_upload, context => $ot2ut_context, error_code => 'conversion_errors', error_cnt => scalar @$errors };
push (@synced, $el);
$el->{errors}= $errors;
$col_sync->insert($el);
my $msg=
{
message => "upload error: eprint_id=[$eprint_id] lastmod=[$lastmod] [conversion errors]",
priority => 'normal',
state => 'new',
to => 'oma'
};
$col_msg->insert($msg);
my $utheses_errors_json_path= 'othes/utheses_json/errors/' . $eprint_id . '.json';
Util::JSON::write_json_file($utheses_errors_json_path, $errors);
......@@ -2564,15 +2581,7 @@ sub ot2ut
print __LINE__, " upload_cmd: [", join(' ', @upload_cmd), "]\n";
if ($do_upload)
{
=begin comment
my $upload_result= `@upload_cmd`;
print __LINE__, " upload_result=[$upload_result]\n";
=end comment
=cut
my $t_curl= time();
system(@upload_cmd);
my $result_data;
......@@ -2585,7 +2594,7 @@ sub ot2ut
{
print __LINE__, " can't parse upload_result; error=[$@]\n";
push (@$errors, { error => 'upload_error', error_info => $@ });
my $el= { eprint_id => $eprint_id, lastmod => $lastmod, ts_upload => $ts_upload, error_code => 'upload_error', 1 };
my $el= { eprint_id => $eprint_id, lastmod => $lastmod, ts_upload => $ts_upload, context => $ot2ut_context, error_code => 'upload_error', 1 };
push (@synced, $el);
$el->{errors}= $errors;
$col_sync->insert($el);
......@@ -2609,6 +2618,7 @@ old format 2019-11..2020-01
eprint_id => $eprint_id,
lastmod => $lastmod,
ts_upload => $ts_upload,
context => $ot2ut_context,
error_code => 'ok',
error_cnt => 0,
utheses_id => $utheses_id,
......@@ -2631,6 +2641,7 @@ old format 2019-11..2020-01
eprint_id => $eprint_id,
lastmod => $lastmod,
ts_upload => $ts_upload,
context => $ot2ut_context,
error_code => 'ok',
error_cnt => 0,
utheses_id => $utheses_id,
......@@ -2639,6 +2650,17 @@ old format 2019-11..2020-01
push (@synced, $out_row);
$col_sync->insert($out_row);
my $td_start= time()-$t_start;
my $td_curl= time()-$t_curl;
my $msg=
{
message => "upload success: eprint_id=[$eprint_id] lastmod=[$lastmod] context=[$ot2ut_context] utheses_id=[$utheses_id] time_total=$td_start time_upload=$td_curl",
priority => 'normal',
state => 'new',
to => 'oma'
};
$col_msg->insert($msg);
}
sleep(5);
......@@ -2687,6 +2709,9 @@ sub generate_utheses_metadata
my $row= $all_rows->{$eprintid};
my $history= get_history($epr_db, $eprintid);
# print __LINE__, " history: ", Dumper($history); exit;
my ($lang_pdf, $files)= analyze_files(map { $row->{$_} } qw(fileinfo dir));
print __LINE__, " lang_pdf=[$lang_pdf] files: ", Dumper($files);
my $main_file;
......@@ -2731,8 +2756,7 @@ sub generate_utheses_metadata
my $utp= $ut->{public};
$utp->{origin}= 'import';
$utp->{datamodel}= 'container';
$utp->{uploaded_by}= 'ot2ut';
# not needed/wanted 2020-05-14: $utp->{datamodel}= 'container';
$utp->{rights_statement}= 'http://rightsstatements.org/vocab/InC/1.0/'; # "In Copyright" or "Alle Rechte vorbehalten"
if (defined ($row->{matr}))
......@@ -2795,7 +2819,12 @@ sub generate_utheses_metadata
$utp->{utheses_status}= ($row->{eprint_status} eq 'archive')
? 'published'
: 'work_in_progress'; # objects in eprint_status "buffer" are 'work_in_progress';
$utp->{utheses_status_last_modified}= get_othes_timestamp($row, 'status_changed');
# $utp->{utheses_status_last_modified}= get_othes_timestamp($row, 'status_changed');
# $utp->{phaidra_thesis_doc_added_date}= get_othes_timestamp($history->{create}, 'timestamp');
$utp->{othes}->{history_create}= get_othes_timestamp($history->{create}, 'timestamp');
$utp->{utheses_status_last_modified}= get_othes_timestamp($history->{move_buffer_to_archive}, 'timestamp');
$utp->{last_modified}= get_othes_timestamp($row, 'lastmod');
$utp->{import}= # stored verbatim in mysql table utheses_import in column import_info
{
......@@ -2862,6 +2891,7 @@ sub generate_utheses_metadata
my ($errors2, $classifications)= $epr->get_classifications($eprintid);
push (@errors, @$errors2) if (@$errors2);
# 2020-05-14 nd: not needed: $thesis->{uploaded_by}= 'ot2ut';
$thesis->{subject_classifications}= $classifications;
$thesis->{number_of_pages}= "$main_file->{page_count}"; # Phaidra expects this as a string
......@@ -2869,7 +2899,8 @@ sub generate_utheses_metadata
# Mon May 11 22:12:38 CEST 2020 asked nd about this, especially thesis_doc_added_date:
my %phaidra= map { $_ => '' } qw(container_pid container_status container_created_date thesis_doc_pid thesis_doc_status);
$phaidra{thesis_doc_added_date}= get_othes_timestamp($row, 'datestamp');
$phaidra{thesis_doc_added_date}= get_othes_timestamp($history->{create}, 'timestamp');
# $phaidra{thesis_doc_added_date}= get_othes_timestamp($row, 'datestamp');
$ut->public('phaidra', \%phaidra);
......@@ -2878,6 +2909,43 @@ sub generate_utheses_metadata
(\@errors, $row, $lastmod, $ut, $utheses_json_path, $files, $utheses_upload_result_json_path);
}
# Fetch all eprints 'history' rows for one eprint and extract key events.
#
# Parameters:
#   $epr_db   - DB handle providing get_all_x(table, [condition, @binds])
#   $eprintid - id of the eprint whose history is requested
#
# Returns a hashref:
#   events                 => arrayref of all history rows, ordered by historyid
#   create                 => first row (by historyid) with action 'create', or undef
#   move_buffer_to_archive => first row (by historyid) with action
#                             'move_buffer_to_archive', or undef
sub get_history
{
  my $epr_db= shift;
  my $eprintid= shift;

  my $history_rows= $epr_db->get_all_x('history', ['objectid=?', $eprintid]);
  # print __LINE__, " history_rows: ", Dumper($history_rows);

  # NOTE: a revision can be present multiple times, so events must be ordered
  # by historyid.  Iterating the sorted ids (instead of raw hash order, which
  # is random in Perl) also makes the "first matching action" selection below
  # deterministic; the previous version scanned the hash unsorted.
  my @historyids= sort { $a <=> $b } keys %$history_rows;
  print __LINE__, " historyids: ", join(' ', @historyids), "\n";

  my (@events, $create, $move_buffer_to_archive);
  foreach my $historyid (@historyids)
  {
    my $row= $history_rows->{$historyid};
    # print __LINE__, " history_row: ", Dumper($row);
    push (@events, $row);

    $create= $row if (!defined ($create) && $row->{action} eq 'create');
    $move_buffer_to_archive= $row
      if (!defined ($move_buffer_to_archive) && $row->{action} eq 'move_buffer_to_archive');
  }

  return
  {
    events => \@events,
    create => $create,
    move_buffer_to_archive => $move_buffer_to_archive,
  };
}
sub get_study_id
{
my $matr= shift;
......@@ -2963,11 +3031,16 @@ sub get_thesis_data
push (@j_titles, { type => 'parallel', title_lang => $lang, title_text => $title, origin => 'title' }) if ($title);
push (@j_titles, { type => 'parallel', title_lang => 'deu', title_text => $title_ger, origin => 'title_ger' }) if ($title_ger);
push (@j_titles, { type => 'parallel', title_lang => 'eng', title_text => $title_eng, origin => 'title_eng' }) if ($title_eng);
push (@j_titles, { type => 'parallel', title_lang => $lang, title_text => $title_zusatz, origin => 'title_zusatz' }) if ($title_zusatz);
@j_titles[0]->{type}= 'main';
# push (@j_titles, { type => 'parallel', title_lang => $lang, title_text => $title_zusatz, origin => 'title_zusatz' }) if ($title_zusatz);
$j_titles[0]->{type}= 'main';
if ($title_zusatz)
{
$j_titles[0]->{subtitle_text}= $title_zusatz;
$j_titles[0]->{subtitle_lang}= $lang;
};
my @j_abstracts;
push (@j_abstracts, { language => $lang, text => $abstract, origin => 'abstract' }) if ($abstract);
push (@j_abstracts, { language => 'deu', text => $abstract, origin => 'abstract' }) if ($abstract);
push (@j_abstracts, { language => 'eng', text => $abstract_eng, origin => 'abstract_eng' }) if ($abstract_eng);
my @keywords= split(/\s*\/\s*/, $keywords);
......@@ -2994,7 +3067,7 @@ sub get_thesis_data
(
languages => [ $lang ],
titles => \@j_titles,
abstract => \@j_abstracts,
abstracts => \@j_abstracts,
keywords => \@j_keywords,
type => $map_ot2ut_thesis_type{$row->{thesis_type}},
......@@ -3210,7 +3283,7 @@ sub debug_keywords
my $epr_db= $epr->connect();
my @col_names_db= qw( eprintid eprint_status sprache keywords keywords_eng );
my $search_term= "eprint_status in ('archive', 'buffer') and sprache<>'ger' and sprache<>'eng'";
# my $search_term= "eprint_status in ('archive', 'buffer') and sprache<>'ger' and sprache<>'eng'";
my $search_term= "eprint_status in ('archive', 'buffer')";
my $keys= $epr_db->get_all_x('eprint', [$search_term], join(',', @col_names_db));
......@@ -3273,6 +3346,38 @@ sub debug_keywords
Util::JSON::write_json_file('/backup/othes/eprints/test/othes_keywords.json', \%all_othes);
}
# Debug helper: dump abstract-related columns for a fixed set of eprints.
#
# NOTE(review): this sub was apparently cloned from debug_keywords(); the
# earlier version opened 'all_keywords.tsv' (clobbering debug_keywords()
# output) with a keyword-column header, declared unused keyword hashes and
# never closed the handle.  It now writes a header matching its own columns
# to an abstracts-specific file.
sub debug_abstracts
{
  my $epr= get_eprints_db($cnf);
  my $epr_db= $epr->connect();

  # columns retrieved from the eprint table
  my @col_names_db= qw( eprintid eprint_status sprache abstract abstract_eng );

  # alternative selections, kept for reference:
  # my $search_term= "eprint_status in ('archive', 'buffer') and sprache<>'ger' and sprache<>'eng'";
  # my $search_term= "eprint_status in ('archive', 'buffer') and sprache='ger'";
  # my $search_term= "eprint_status in ('archive', 'buffer') and sprache='eng'";
  # my $search_term= "eprint_status in ('archive', 'buffer')";
  my $search_term= "eprintid in (2276, 3432, 8314, 9358, 10236, 10941, 15148, 15934, 18224, 23898, 27575, 28791, 30614, 32692, 35111, 38069, 40982, 42122, 43078, 44504, 44510, 46380, 46381, 49927, 51776, 52780, 52925, 56916, 60835)";

  my $keys= $epr_db->get_all_x('eprint', [$search_term], join(',', @col_names_db));

  # write to an abstracts-specific file so debug_keywords() output is not overwritten
  open (FO, '>:utf8', 'all_abstracts.tsv') or die;
  print FO join("\t", @col_names_db), "\n";

  foreach my $key (keys %$keys)
  {
    my $r= $keys->{$key};
    print __LINE__, " key=[$key] ", Dumper($r);
    my ($id, $es, $lang, $abs, $abse)= map { $r->{$_} } @col_names_db;
    # possible normalization/output step, not enabled yet:
    # $abs =~ tr/ \t\r\n/ /s;
    # print join("\t", $id, $abs), "\n";
  }
  close (FO);
}
sub debug_stkz
{
my $epr= get_eprints_db($cnf);
......
......@@ -43,7 +43,7 @@ sub fetch_data
$conditions .= ' AND doi IS NULL' if ($c eq 'doi' && !$other_conditions->{doi});
}
$m->show_query(1);
# $m->show_query(1);
my $res= $m->get_all_x ('eprint', [$conditions, $eprint_status],
'eprintid,eprint_status,ac_nummer,type,matr,urn,uri,sperre,einverstaendnis,rev_number'
);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment