Skip to content
Snippets Groups Projects
Commit 74c76d2c authored by Gerhard Gonter's avatar Gerhard Gonter :speech_balloon:
Browse files

various updates for ot2ut

parent 285c5c08
No related branches found
No related tags found
No related merge requests found
......@@ -111,8 +111,9 @@ my $show_TODOs= 0;
my $die_nbn_already_defined= 0;
# ======================================================================
# BEGIN OT2UT: Othesis to Utheses
my $ot2ut_context= 'ot2ut-entw'; # TODO: parametrize
# BEGIN OT2UT: Othes to Utheses
# my $ot2ut_context= 'ot2ut-entw'; # TODO: parametrize
my $ot2ut_context= 'ot2ut-test'; # TODO: parametrize
my %map_ot2ut_roles=
(
......@@ -128,6 +129,7 @@ my %map_ot2ut_thesis_columns=
ac_nummer => 'ac_number',
pages => 'pages_scope',
date_year => 'publication_date',
eprintid => 'eprint_id',
);
my %map_ot2ut_json_columns=
......@@ -164,6 +166,8 @@ my $force= 0;
my $do_upload= 0;
my $db_name;
my $no_doi= 0;
my $ignore_errors= 0;
my $ot2ut_eprint_status= 'archive';
if ($0 eq './ot2ut.pl') { $op_mode= 'ot2ut'; $MAX_SYNC= 1; $do_upload= 1; }
......@@ -214,7 +218,9 @@ while (defined ($arg= shift (@ARGV)))
elsif ($opt eq 'reset') { $op_mode= $opt; }
elsif ($opt eq 'force') { $force= defined($val) ? $val : 1; }
elsif ($opt eq 'upload') { $do_upload= defined($val) ? $val : 1; }
elsif ($opt eq 'buffer') { $ot2ut_eprint_status= 'buffer'; $no_doi= 1; }
elsif ($opt eq 'no-doi') { $no_doi= defined($val) ? $val : 1; }
elsif ($opt eq 'ignore-errors') { $ignore_errors= defined($val) ? $val : 1; }
else { usage("unknown option $arg"); }
}
elsif ($arg =~ /^-(.+)/)
......@@ -2344,10 +2350,19 @@ old format...
=end comment
=cut
my @ts= map { $row->{$name . '_' . $_} } qw(year month day);
push (@ts, (exists ($row->{$name . '_hour'}))
? (map { $row->{$name . '_' . $_} } qw(hour minute second))
: (0, 0, 0));
my @ts;
foreach my $el (qw(year month day))
{
my $x= $row->{$name . '_' . $el};
return undef unless (defined ($x));
push (@ts, $x);
}
foreach my $el (qw(hour minute second))
{
my $f= join('_', $name, $el);
push (@ts, (exists ($row->{$f})) ? $row->{$f} : 0);
}
sprintf ("%4d-%02d-%02dT%02d:%02d:%02dZ", @ts);
}
......@@ -2480,8 +2495,25 @@ sub ot2ut
unless (@eprint_ids)
{
print __LINE__, " fetching data\n";
my $res1= $epr->fetch_data('archive', { doi => 1 });
my $res1;
if ($ot2ut_eprint_status eq 'archive')
{
$res1= $epr->fetch_data('archive', { doi => 1 });
}
elsif ($ot2ut_eprint_status eq 'buffer')
{
$res1= $epr->fetch_data('buffer'); # these do not have DOIs
}
if (defined ($res1))
{
push (@eprint_ids, keys %$res1);
}
else
{
die "no eprints objects found";
}
# TODO, future ...
# my $res2= $epr->fetch_data('buffer');
......@@ -2500,13 +2532,13 @@ sub ot2ut
my $sync_info= $col_sync->find_one({eprint_id => $eprint_id});
my ($errors, $row, $lastmod, $ut, $utheses_json_path, $files, $utheses_upload_result_json_path)= generate_utheses_metadata($epr, $eprint_id);
my ($errors, $warnings, $row, $lastmod, $ut, $utheses_json_path, $files, $utheses_upload_result_json_path)= generate_utheses_metadata($epr, $eprint_id);
print __LINE__, " sync_info=[$sync_info]\n";
if (defined ($sync_info))
{
print __LINE__, " sync_info: ", Dumper($sync_info);
if ($sync_info->{error_code} eq 'ok')
if ($sync_info->{error_code} eq 'ok' && !$force)
{
# TODO: check for updates in utheses row
......@@ -2542,7 +2574,10 @@ sub ot2ut
{ # report error
print __LINE__, " ERRORS; ut: ", Dumper($ut);
print __LINE__, " generate_utheses_metadata: errors: ", Dumper($errors) if (@$errors);
}
if (@$errors && ! $ignore_errors)
{
my $el= { eprint_id => $eprint_id, lastmod => $lastmod, ts_upload => $ts_upload, context => $ot2ut_context, error_code => 'conversion_errors', error_cnt => scalar @$errors };
push (@synced, $el);
$el->{errors}= $errors;
......@@ -2695,6 +2730,7 @@ sub generate_utheses_metadata
my $eprintid= shift;
my @errors= ();
my @warnings= ();
my $epr_db= $epr->connect();
my $all_rows= $epr_db->get_all_x('eprint', ['eprintid=?', $eprintid]);
......@@ -2892,15 +2928,17 @@ sub generate_utheses_metadata
foreach my $role (keys %map_ot2ut_roles)
{
print __LINE__, " extracting names for role=[$role]\n";
my ($errors, $names)= get_names_for_role($row, $map_ot2ut_roles{$role});
my ($errors, $warnings, $names)= get_names_for_role($row, $map_ot2ut_roles{$role});
push (@errors, @$errors) if (@$errors);
push (@warnings, @$warnings) if (@$warnings);
$ut->public($role, $names);
}
my ($errors1, $thesis)= get_thesis_data($row);
my ($errors1, $warnings1, $thesis)= get_thesis_data($row);
push (@errors, @$errors1) if (@$errors1);
push (@warnings, @$warnings1) if (@$warnings1);
my ($errors2, $classifications)= $epr->get_classifications($eprintid);
push (@errors, @$errors2) if (@$errors2);
......@@ -2920,7 +2958,7 @@ sub generate_utheses_metadata
Util::JSON::write_json_file($utheses_json_path, $ut->{public});
(\@errors, $row, $lastmod, $ut, $utheses_json_path, $files, $utheses_upload_result_json_path);
(\@errors, \@warnings, $row, $lastmod, $ut, $utheses_json_path, $files, $utheses_upload_result_json_path);
}
sub get_history
......@@ -3036,6 +3074,7 @@ sub get_thesis_data
my $row= shift;
my @errors= ();
my @warnings= ();
my ($lang, $abstract, $abstract_eng, $title, $title_ger, $title_eng, $title_zusatz, $keywords, $keywords_eng)=
map { my $x= $row->{$_}; $x=~ s#\r##g; $x=~ s#^\s*##; $x=~ s#\s*$##; $x; }
......@@ -3060,8 +3099,11 @@ sub get_thesis_data
push (@j_abstracts, { language => 'deu', text => $abstract, origin => 'abstract' }) if ($abstract);
push (@j_abstracts, { language => 'eng', text => $abstract_eng, origin => 'abstract_eng' }) if ($abstract_eng);
my @keywords= split(/\s*\/\s*/, $keywords);
my @keywords_eng= split(/\s*\/\s*/, $keywords_eng);
# NOTE: cleanup_keywords() returns a list of cleaned up keywords!
# my @keywords= split(/\s*\/\s*/, $keywords);
# my @keywords_eng= split(/\s*\/\s*/, $keywords_eng);
my ($n_kw, $l_kw)= cleanup_keywords($keywords);
my ($n_kwe, $l_kwe)= cleanup_keywords($keywords_eng);
my @j_keywords;
# NOTE: each keyword as a separate element
......@@ -3072,13 +3114,17 @@ sub get_thesis_data
# push (@j_keywords, map { { language => 'deu', text => $_ } } @keywords) if (@keywords);
# 2020-01-31: new schema: text is now an array reference
push (@j_keywords, { language => 'eng', text => \@keywords_eng }) if (@keywords_eng);
push (@j_keywords, { language => 'deu', text => \@keywords }) if (@keywords);
# push (@j_keywords, { language => 'eng', text => \@keywords_eng }) if (@keywords_eng);
# push (@j_keywords, { language => 'deu', text => \@keywords }) if (@keywords);
# NOTE: all keywords separated by comma
# push (@j_keywords, { language => 'eng', text => join(', ', @keywords_eng) }) if (@keywords_eng);
# push (@j_keywords, { language => 'deu', text => join(', ', @keywords) }) if (@keywords);
# 2020-06-15: use cleaned keyword lists
push (@j_keywords, { language => 'eng', text => $l_kwe }) if (@$l_kwe);
push (@j_keywords, { language => 'deu', text => $l_kw }) if (@$l_kw);
# TODO: language logic needs to be improved, this is plain bad.
my %thesis=
(
......@@ -3144,10 +3190,23 @@ mysql> select count(*), abstract_nicht_anzeigen from eprint group by abstract_ni
next if ($av eq '');
$thesis{$an2}= $av;
}
$thesis{assessment_date}= get_othes_timestamp($row, 'date_app');
# Derive the assessment date from the date_app_* columns of the row;
# when no timestamp can be built, record a warning and store an empty
# string instead of leaving the field undefined.
my $assessment_date= get_othes_timestamp($row, 'date_app');
if (defined ($assessment_date))
{
$thesis{assessment_date}= $assessment_date;
}
else
{ # 2020-05-29 15:12 nd commented on gg's message: "@nd did you see my messages regarding eprint_id=1982? date_app_year is NULL there, i.e. there is no assessment date; we have 95 such objects ..."
# Reply: we should keep a note of these objects; please pass an empty string instead.
push (@warnings, { warning => 'date_app missing, can not assign assessment_date' });
$thesis{assessment_date}= '';
}
print __LINE__, " thesis: ", Dumper (\%thesis);
(\@errors, \%thesis);
(\@errors, \@warnings, \%thesis);
}
sub get_names_for_role
......@@ -3158,6 +3217,8 @@ sub get_names_for_role
print __LINE__, " column_names: ", Dumper($column_names);
my @errors= ();
my @warnings= ();
my @result;
foreach my $column_name (@$column_names)
{
......@@ -3174,8 +3235,9 @@ sub get_names_for_role
print __LINE__, " column_name=[$column_name] name=[$name] nn=[$nn] vn=[$vn]\n";
if ($vn eq '' || !($vn =~ m#^\U\E[\w\-\x{2010} ]+\.?$#) || !($nn =~ m#^\U\E[\w\-\x{2010} ]+$#))
{
{ # TODO: add option to flag this as a warning instead of as an error
push (@errors, { error => 'bad_name', column_name => $column_name, name => $name } );
push (@result, { family_name => $name }); # fill everything in into family_name
}
else
{
......@@ -3184,7 +3246,7 @@ sub get_names_for_role
}
}
(\@errors, \@result);
(\@errors, \@warnings, \@result);
}
sub analyze_files
......@@ -3500,26 +3562,25 @@ sub cleanup_keywords
my @notes;
push (@notes, 'kw_ws_eol') if ($s =~ s#[\t\s\n]+$##g); # ignore spaces at the end
push (@notes, 'kw_delim_eol') if ($s =~ s#[,;/]+$##g); # ignore delimiters at the end
push (@notes, 'kw_delim_eol') if ($s =~ s#[\.,;/]+$##g); # ignore delimiters at the end
push (@notes, 'kw_lf') if ($s =~ s#[\r]+##g); #
push (@notes, 'kw_tab') if ($s =~ s#[\t]+# #g); # tabs are blanks
push (@notes, 'kw_nl') if ($s =~ s#\n+# / #g); # newline as delimiter
$s=~ s/^\s*//;
# $s=~ s/\s*$//;
$s=~ s/ +/ /g;
# $s=~ s/ +/ /g;
return (['kw_empty'], []) if ($s eq '' or $s =~ /^\s*nicht\s*angegeben\s*\.*\s*$/);
my @keywords;
my @kw1= split('\s+/\s+', $s);
my @kw1= split('\s+/[\s\/]+', $s); # "foo / / bar" should be only two fields
if (@kw1 == 1)
{
my @kw2= split('\s*;\s*', $s);
my @kw3= split('\s*,\s*', $s);
# print __LINE__, " kw3: ", Dumper(\@kw3);
if (@kw1 == 1)
{
if (@kw2 > 1)
{
push (@notes, 'kw_semicolon');
......@@ -3532,14 +3593,33 @@ sub cleanup_keywords
# print __LINE__, " KW3\n";
}
else
{
my @kw4= split('\s*/\s+', $s);
if (@kw4 > 1)
{
@keywords= @kw4;
push (@notes, 'kw_slash2');
}
else
{
# Last resort: split on a slash without requiring any surrounding
# whitespace (e.g. "foo/bar").  This point is only reached when the
# split into @kw4 yielded a single field, so the keyword list has to
# be taken from @kw5, not from @kw4.
my @kw5= split('\s*/\s*', $s);
if (@kw5 > 1)
{
@keywords= @kw5;
push (@notes, 'kw_slash3');
}
else
{
# none of the known delimiters matched: treat the whole string as one keyword
push (@notes, 'kw_single');
@keywords= @kw1;
}
}
}
}
else
{
@keywords= @kw1;
push (@notes, 'kw_slash');
}
# print __LINE__, " keywords: ", Dumper (\@keywords);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment