From 74c76d2c821b5a73886da61297cdf7f5c2440b46 Mon Sep 17 00:00:00 2001 From: Gerhard Gonter <ggonter@gmail.com> Date: Thu, 18 Jun 2020 13:57:44 +0200 Subject: [PATCH] various updates for ot2ut --- eprints1.pl | 148 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 114 insertions(+), 34 deletions(-) diff --git a/eprints1.pl b/eprints1.pl index 50b56e9..1afc5ef 100755 --- a/eprints1.pl +++ b/eprints1.pl @@ -111,8 +111,9 @@ my $show_TODOs= 0; my $die_nbn_already_defined= 0; # ====================================================================== -# BEGIN OT2UT: Othesis to Utheses -my $ot2ut_context= 'ot2ut-entw'; # TODO: parametrize +# BEGIN OT2UT: Othes to Utheses +# my $ot2ut_context= 'ot2ut-entw'; # TODO: parametrize +my $ot2ut_context= 'ot2ut-test'; # TODO: parametrize my %map_ot2ut_roles= ( @@ -128,6 +129,7 @@ my %map_ot2ut_thesis_columns= ac_nummer => 'ac_number', pages => 'pages_scope', date_year => 'publication_date', + eprintid => 'eprint_id', ); my %map_ot2ut_json_columns= @@ -164,6 +166,8 @@ my $force= 0; my $do_upload= 0; my $db_name; my $no_doi= 0; +my $ignore_errors= 0; +my $ot2ut_eprint_status= 'archive'; if ($0 eq './ot2ut.pl') { $op_mode= 'ot2ut'; $MAX_SYNC= 1; $do_upload= 1; } @@ -214,7 +218,9 @@ while (defined ($arg= shift (@ARGV))) elsif ($opt eq 'reset') { $op_mode= $opt; } elsif ($opt eq 'force') { $force= defined($val) ? $val : 1; } elsif ($opt eq 'upload') { $do_upload= defined($val) ? $val : 1; } + elsif ($opt eq 'buffer') { $ot2ut_eprint_status= 'buffer'; $no_doi= 1; } elsif ($opt eq 'no-doi') { $no_doi= defined($val) ? $val : 1; } + elsif ($opt eq 'ignore-errors') { $ignore_errors= defined($val) ? $val : 1; } else { usage("unknown option $arg"); } } elsif ($arg =~ /^-(.+)/) @@ -2344,10 +2350,19 @@ old format... =end comment =cut - my @ts= map { $row->{$name . '_' . $_} } qw(year month day); - push (@ts, (exists ($row->{$name . '_hour'})) - ? (map { $row->{$name . '_' . 
$_} } qw(hour minute second)) - : (0, 0, 0)); + my @ts; + foreach my $el (qw(year month day)) + { + my $x= $row->{$name . '_' . $el}; + return undef unless (defined ($x)); + push (@ts, $x); + } + + foreach my $el (qw(hour minute second)) + { + my $f= join('_', $name, $el); + push (@ts, (exists ($row->{$f})) ? $row->{$f} : 0); + } sprintf ("%4d-%02d-%02dT%02d:%02d:%02dZ", @ts); } @@ -2480,8 +2495,25 @@ sub ot2ut unless (@eprint_ids) { print __LINE__, " fetching data\n"; - my $res1= $epr->fetch_data('archive', { doi => 1 }); - push (@eprint_ids, keys %$res1); + my $res1; + + if ($ot2ut_eprint_status eq 'archive') + { + $res1= $epr->fetch_data('archive', { doi => 1 }); + } + elsif ($ot2ut_eprint_status eq 'buffer') + { + $res1= $epr->fetch_data('buffer'); # these do not have DOIs + } + + if (defined ($res1)) + { + push (@eprint_ids, keys %$res1); + } + else + { + die "no eprints objects found"; + } # TODO, future ... # my $res2= $epr->fetch_data('buffer'); @@ -2500,13 +2532,13 @@ sub ot2ut my $sync_info= $col_sync->find_one({eprint_id => $eprint_id}); - my ($errors, $row, $lastmod, $ut, $utheses_json_path, $files, $utheses_upload_result_json_path)= generate_utheses_metadata($epr, $eprint_id); + my ($errors, $warnings, $row, $lastmod, $ut, $utheses_json_path, $files, $utheses_upload_result_json_path)= generate_utheses_metadata($epr, $eprint_id); print __LINE__, " sync_info=[$sync_info]\n"; if (defined ($sync_info)) { print __LINE__, " sync_info: ", Dumper($sync_info); - if ($sync_info->{error_code} eq 'ok') + if ($sync_info->{error_code} eq 'ok' && !$force) { # TODO: check for updates in utheses row @@ -2542,7 +2574,10 @@ sub ot2ut { # report error print __LINE__, " ERRORS; ut: ", Dumper($ut); print __LINE__, " generate_utheses_metadata: errors: ", Dumper($errors) if (@$errors); + } + if (@$errors && ! 
$ignore_errors) + { my $el= { eprint_id => $eprint_id, lastmod => $lastmod, ts_upload => $ts_upload, context => $ot2ut_context, error_code => 'conversion_errors', error_cnt => scalar @$errors }; push (@synced, $el); $el->{errors}= $errors; @@ -2695,6 +2730,7 @@ sub generate_utheses_metadata my $eprintid= shift; my @errors= (); + my @warnings= (); my $epr_db= $epr->connect(); my $all_rows= $epr_db->get_all_x('eprint', ['eprintid=?', $eprintid]); @@ -2892,15 +2928,17 @@ sub generate_utheses_metadata foreach my $role (keys %map_ot2ut_roles) { print __LINE__, " extracting names for role=[$role]\n"; - my ($errors, $names)= get_names_for_role($row, $map_ot2ut_roles{$role}); + my ($errors, $warnings, $names)= get_names_for_role($row, $map_ot2ut_roles{$role}); - push (@errors, @$errors) if (@$errors); + push (@errors, @$errors) if (@$errors); + push (@warnings, @$warnings) if (@$warnings); $ut->public($role, $names); } - my ($errors1, $thesis)= get_thesis_data($row); + my ($errors1, $warnings1, $thesis)= get_thesis_data($row); push (@errors, @$errors1) if (@$errors1); + push (@warnings, @$warnings1) if (@$warnings1); my ($errors2, $classifications)= $epr->get_classifications($eprintid); push (@errors, @$errors2) if (@$errors2); @@ -2920,7 +2958,7 @@ sub generate_utheses_metadata Util::JSON::write_json_file($utheses_json_path, $ut->{public}); - (\@errors, $row, $lastmod, $ut, $utheses_json_path, $files, $utheses_upload_result_json_path); + (\@errors, \@warnings, $row, $lastmod, $ut, $utheses_json_path, $files, $utheses_upload_result_json_path); } sub get_history @@ -3035,7 +3073,8 @@ sub get_thesis_data { my $row= shift; - my @errors=(); + my @errors= (); + my @warnings= (); my ($lang, $abstract, $abstract_eng, $title, $title_ger, $title_eng, $title_zusatz, $keywords, $keywords_eng)= map { my $x= $row->{$_}; $x=~ s#\r##g; $x=~ s#^\s*##; $x=~ s#\s*$##; $x; } @@ -3060,8 +3099,11 @@ sub get_thesis_data push (@j_abstracts, { language => 'deu', text => $abstract, origin => 
'abstract' }) if ($abstract); push (@j_abstracts, { language => 'eng', text => $abstract_eng, origin => 'abstract_eng' }) if ($abstract_eng); - my @keywords= split(/\s*\/\s*/, $keywords); - my @keywords_eng= split(/\s*\/\s*/, $keywords_eng); + # NOTE: cleanup_keywords() returns a list of cleaned up keywords! + # my @keywords= split(/\s*\/\s*/, $keywords); + # my @keywords_eng= split(/\s*\/\s*/, $keywords_eng); + my ($n_kw, $l_kw)= cleanup_keywords($keywords); + my ($n_kwe, $l_kwe)= cleanup_keywords($keywords_eng); my @j_keywords; # NOTE: each keyword as a separate element @@ -3072,13 +3114,17 @@ sub get_thesis_data # push (@j_keywords, map { { language => 'deu', text => $_ } } @keywords) if (@keywords); # 2020-01-31: new schema: text is now an array reference - push (@j_keywords, { language => 'eng', text => \@keywords_eng }) if (@keywords_eng); - push (@j_keywords, { language => 'deu', text => \@keywords }) if (@keywords); + # push (@j_keywords, { language => 'eng', text => \@keywords_eng }) if (@keywords_eng); + # push (@j_keywords, { language => 'deu', text => \@keywords }) if (@keywords); # NOTE: all keywords separated by comma # push (@j_keywords, { language => 'eng', text => join(', ', @keywords_eng) }) if (@keywords_eng); # push (@j_keywords, { language => 'deu', text => join(', ', @keywords) }) if (@keywords); + # 2020-06-15: use cleaned keyword lists + push (@j_keywords, { language => 'eng', text => $l_kwe }) if (@$l_kwe); + push (@j_keywords, { language => 'deu', text => $l_kw }) if (@$l_kw); + # TODO: language logic needs to be improved, this is plain bad. 
my %thesis= ( @@ -3144,10 +3190,23 @@ mysql> select count(*), abstract_nicht_anzeigen from eprint group by abstract_ni next if ($av eq ''); $thesis{$an2}= $av; } - $thesis{assessment_date}= get_othes_timestamp($row, 'date_app'); + + my $assessment_date= get_othes_timestamp($row, 'date_app'); + + if (defined ($assessment_date)) + { + $thesis{assessment_date}= $assessment_date; + } + else + { # 2020-05-29 15:12 nd Commented on gg's message: @nd hast du meine messages bezueglich eprint_id=1982 gesehen? dort ist date_app_year NULL, d.h. es gibt kein assessment date; wir haben 95 solcher objekte ... + # diese objekte sollten wir uns notieren. bitte stattdessen einen leeren string uebergeben. + + push (@warnings, { warning => 'date_app missing, can not assign assessment_date' }); + $thesis{assessment_date}= ''; + } print __LINE__, " thesis: ", Dumper (\%thesis); - (\@errors, \%thesis); + (\@errors, \@warnings, \%thesis); } sub get_names_for_role @@ -3157,7 +3216,9 @@ sub get_names_for_role print __LINE__, " column_names: ", Dumper($column_names); - my @errors=(); + my @errors= (); + my @warnings= (); + my @result; foreach my $column_name (@$column_names) { @@ -3174,8 +3235,9 @@ sub get_names_for_role print __LINE__, " column_name=[$column_name] name=[$name] nn=[$nn] vn=[$vn]\n"; if ($vn eq '' || !($vn =~ m#^\U\E[\w\-\x{2010} ]+\.?$#) || !($nn =~ m#^\U\E[\w\-\x{2010} ]+$#)) - { + { # TODO: add option to flag this as a warning instead of as an error push (@errors, { error => 'bad_name', column_name => $column_name, name => $name } ); + push (@result, { family_name => $name }); # fill everything in into family_name } else { @@ -3184,7 +3246,7 @@ sub get_names_for_role } } - (\@errors, \@result); + (\@errors, \@warnings, \@result); } sub analyze_files @@ -3500,26 +3562,25 @@ sub cleanup_keywords my @notes; push (@notes, 'kw_ws_eol') if ($s =~ s#[\t\s\n]+$##g); # ignore spaces at the end - push (@notes, 'kw_delim_eol') if ($s =~ s#[,;/]+$##g); # ignore delimiters at the end + 
push (@notes, 'kw_delim_eol') if ($s =~ s#[\.,;/]+$##g); # ignore delimiters at the end push (@notes, 'kw_lf') if ($s =~ s#[\r]+##g); # push (@notes, 'kw_tab') if ($s =~ s#[\t]+# #g); # tabs are blanks push (@notes, 'kw_nl') if ($s =~ s#\n+# / #g); # newline as delimiter $s=~ s/^\s*//; # $s=~ s/\s*$//; - $s=~ s/ +/ /g; +# $s=~ s/ +/ /g; return (['kw_empty'], []) if ($s eq '' or $s =~ /^\s*nicht\s*angegeben\s*\.*\s*$/); my @keywords; - my @kw1= split('\s+/\s+', $s); - my @kw2= split('\s*;\s*', $s); - my @kw3= split('\s*,\s*', $s); - - # print __LINE__, " kw3: ", Dumper(\@kw3); - + my @kw1= split('\s+/[\s\/]+', $s); # "foo / / bar" should be only two fields if (@kw1 == 1) { + my @kw2= split('\s*;\s*', $s); + my @kw3= split('\s*,\s*', $s); + # print __LINE__, " kw3: ", Dumper(\@kw3); + if (@kw2 > 1) { push (@notes, 'kw_semicolon'); @@ -3533,13 +3594,32 @@ sub cleanup_keywords } else { - push (@notes, 'kw_single'); - @keywords= @kw1; + my @kw4= split('\s*/\s+', $s); + if (@kw4 > 1) + { + @keywords= @kw4; + push (@notes, 'kw_slash2'); + } + else + { + my @kw5= split('\s*/\s*', $s); + if (@kw5 > 1) + { + @keywords= @kw5; + push (@notes, 'kw_slash3'); + } + else + { + push (@notes, 'kw_single'); + @keywords= @kw1; + } + } } } else { @keywords= @kw1; + push (@notes, 'kw_slash'); } # print __LINE__, " keywords: ", Dumper (\@keywords); -- GitLab