From 74c76d2c821b5a73886da61297cdf7f5c2440b46 Mon Sep 17 00:00:00 2001
From: Gerhard Gonter <ggonter@gmail.com>
Date: Thu, 18 Jun 2020 13:57:44 +0200
Subject: [PATCH] various updates for ot2ut

---
 eprints1.pl | 148 ++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 114 insertions(+), 34 deletions(-)

diff --git a/eprints1.pl b/eprints1.pl
index 50b56e9..1afc5ef 100755
--- a/eprints1.pl
+++ b/eprints1.pl
@@ -111,8 +111,9 @@ my $show_TODOs= 0;
 my $die_nbn_already_defined= 0;
 
 # ======================================================================
-# BEGIN OT2UT: Othesis to Utheses 
-my $ot2ut_context= 'ot2ut-entw'; # TODO: parametrize
+# BEGIN OT2UT: Othes to Utheses 
+# my $ot2ut_context= 'ot2ut-entw'; # TODO: parametrize
+my $ot2ut_context= 'ot2ut-test'; # TODO: parametrize
 
 my %map_ot2ut_roles=
 (
@@ -128,6 +129,7 @@ my %map_ot2ut_thesis_columns=
   ac_nummer => 'ac_number',
   pages     => 'pages_scope',
   date_year => 'publication_date',
+  eprintid  => 'eprint_id',
 );
 
 my %map_ot2ut_json_columns=
@@ -164,6 +166,8 @@ my $force= 0;
 my $do_upload= 0;
 my $db_name;
 my $no_doi= 0;
+my $ignore_errors= 0;
+my $ot2ut_eprint_status= 'archive';
 
 if ($0 eq './ot2ut.pl') { $op_mode= 'ot2ut'; $MAX_SYNC= 1; $do_upload= 1; }
 
@@ -214,7 +218,9 @@ while (defined ($arg= shift (@ARGV)))
     elsif ($opt eq 'reset')   { $op_mode= $opt;  }
     elsif ($opt eq 'force')   { $force= defined($val) ? $val : 1; }
     elsif ($opt eq 'upload')  { $do_upload= defined($val) ? $val : 1; }
+    elsif ($opt eq 'buffer')  { $ot2ut_eprint_status= 'buffer'; $no_doi= 1; }
     elsif ($opt eq 'no-doi')  { $no_doi= defined($val) ? $val : 1; }
+    elsif ($opt eq 'ignore-errors') { $ignore_errors= defined($val) ? $val : 1; }
     else { usage("unknown option $arg"); }
   }
   elsif ($arg =~ /^-(.+)/)
@@ -2344,10 +2350,19 @@ old format...
 =end comment
 =cut
 
-  my @ts= map { $row->{$name . '_' . $_} } qw(year month day);
-  push (@ts, (exists ($row->{$name . '_hour'}))
-             ? (map { $row->{$name . '_' . $_} } qw(hour minute second))
-             : (0, 0, 0));
+  my @ts;
+  foreach my $el (qw(year month day))
+  {
+    my $x= $row->{$name . '_' . $el};
+    return undef unless (defined ($x));
+    push (@ts, $x);
+  }
+
+  foreach my $el (qw(hour minute second))
+  {
+    my $f= join('_', $name, $el);
+    push (@ts, (exists ($row->{$f})) ? $row->{$f} : 0);
+  }
 
   sprintf ("%4d-%02d-%02dT%02d:%02d:%02dZ", @ts);
 }
@@ -2480,8 +2495,25 @@ sub ot2ut
   unless (@eprint_ids)
   {
     print __LINE__, " fetching data\n";
-    my $res1= $epr->fetch_data('archive', { doi => 1 });
-    push (@eprint_ids, keys %$res1);
+    my $res1;
+
+    if ($ot2ut_eprint_status eq 'archive')
+    {
+      $res1= $epr->fetch_data('archive', { doi => 1 });
+    }
+    elsif ($ot2ut_eprint_status eq 'buffer')
+    {
+      $res1= $epr->fetch_data('buffer'); # these do not have DOIs
+    }
+
+    if (defined ($res1))
+    {
+      push (@eprint_ids, keys %$res1);
+    }
+    else
+    {
+      die "no eprints objects found";
+    }
 
     # TODO, future ...
     # my $res2= $epr->fetch_data('buffer');
@@ -2500,13 +2532,13 @@ sub ot2ut
 
     my $sync_info= $col_sync->find_one({eprint_id => $eprint_id});
 
-    my ($errors, $row, $lastmod, $ut, $utheses_json_path, $files, $utheses_upload_result_json_path)= generate_utheses_metadata($epr, $eprint_id);
+    my ($errors, $warnings, $row, $lastmod, $ut, $utheses_json_path, $files, $utheses_upload_result_json_path)= generate_utheses_metadata($epr, $eprint_id);
 
     print __LINE__, " sync_info=[$sync_info]\n";
     if (defined ($sync_info))
     {
       print __LINE__, " sync_info: ", Dumper($sync_info);
-      if ($sync_info->{error_code} eq 'ok')
+      if ($sync_info->{error_code} eq 'ok' && !$force)
       {
         # TODO: check for updates in utheses row
 
@@ -2542,7 +2574,10 @@ sub ot2ut
     { # report error
       print __LINE__, " ERRORS; ut: ", Dumper($ut);
       print __LINE__, " generate_utheses_metadata: errors: ", Dumper($errors) if (@$errors);
+    }
 
+    if (@$errors && ! $ignore_errors)
+    {
       my $el= { eprint_id => $eprint_id, lastmod => $lastmod, ts_upload => $ts_upload, context => $ot2ut_context, error_code => 'conversion_errors', error_cnt => scalar @$errors };
       push (@synced, $el);
       $el->{errors}= $errors;
@@ -2695,6 +2730,7 @@ sub generate_utheses_metadata
   my $eprintid= shift;
 
   my @errors= ();
+  my @warnings= ();
 
   my $epr_db= $epr->connect();
   my $all_rows= $epr_db->get_all_x('eprint', ['eprintid=?', $eprintid]);
@@ -2892,15 +2928,17 @@ sub generate_utheses_metadata
   foreach my $role (keys %map_ot2ut_roles)
   {
     print __LINE__, " extracting names for role=[$role]\n";
-    my ($errors, $names)= get_names_for_role($row, $map_ot2ut_roles{$role});
+    my ($errors, $warnings, $names)= get_names_for_role($row, $map_ot2ut_roles{$role});
 
-    push (@errors, @$errors) if (@$errors);
+    push (@errors,   @$errors)   if (@$errors);
+    push (@warnings, @$warnings) if (@$warnings);
 
     $ut->public($role, $names);
   }
 
-  my ($errors1, $thesis)= get_thesis_data($row);
+  my ($errors1, $warnings1, $thesis)= get_thesis_data($row);
   push (@errors, @$errors1) if (@$errors1);
+  push (@warnings, @$warnings1) if (@$warnings1);
 
   my ($errors2, $classifications)= $epr->get_classifications($eprintid);
   push (@errors, @$errors2) if (@$errors2);
@@ -2920,7 +2958,7 @@ sub generate_utheses_metadata
 
   Util::JSON::write_json_file($utheses_json_path, $ut->{public});
 
-  (\@errors, $row, $lastmod, $ut, $utheses_json_path, $files, $utheses_upload_result_json_path);
+  (\@errors, \@warnings, $row, $lastmod, $ut, $utheses_json_path, $files, $utheses_upload_result_json_path);
 }
 
 sub get_history
@@ -3035,7 +3073,8 @@ sub get_thesis_data
 {
   my $row= shift;
 
-  my @errors=();
+  my @errors= ();
+  my @warnings= ();
 
   my ($lang, $abstract, $abstract_eng, $title, $title_ger, $title_eng, $title_zusatz, $keywords, $keywords_eng)=
      map { my $x= $row->{$_}; $x=~ s#\r##g; $x=~ s#^\s*##; $x=~ s#\s*$##; $x; }
@@ -3060,8 +3099,11 @@ sub get_thesis_data
   push (@j_abstracts, { language => 'deu', text => $abstract,     origin => 'abstract'     }) if ($abstract);
   push (@j_abstracts, { language => 'eng', text => $abstract_eng, origin => 'abstract_eng' }) if ($abstract_eng);
 
-  my @keywords=     split(/\s*\/\s*/, $keywords);
-  my @keywords_eng= split(/\s*\/\s*/, $keywords_eng);
+  # NOTE: cleanup_keywords() returns a list of cleaned up keywords!
+  # my @keywords=     split(/\s*\/\s*/, $keywords);
+  # my @keywords_eng= split(/\s*\/\s*/, $keywords_eng);
+  my ($n_kw,  $l_kw)=  cleanup_keywords($keywords);
+  my ($n_kwe, $l_kwe)= cleanup_keywords($keywords_eng);
 
   my @j_keywords;
   # NOTE: each keyword as a separate element
@@ -3072,13 +3114,17 @@ sub get_thesis_data
   # push (@j_keywords, map { { language => 'deu', text => $_ } } @keywords)     if (@keywords);
 
   # 2020-01-31: new schema: text is now an array reference
-  push (@j_keywords, { language => 'eng', text => \@keywords_eng }) if (@keywords_eng);
-  push (@j_keywords, { language => 'deu', text => \@keywords     }) if (@keywords);
+  # push (@j_keywords, { language => 'eng', text => \@keywords_eng }) if (@keywords_eng);
+  # push (@j_keywords, { language => 'deu', text => \@keywords     }) if (@keywords);
 
   # NOTE: all keywords separated by comma
   # push (@j_keywords, { language => 'eng', text => join(', ', @keywords_eng) }) if (@keywords_eng);
   # push (@j_keywords, { language => 'deu', text => join(', ', @keywords)     }) if (@keywords);
 
+  # 2020-06-15: use cleaned keyword lists
+  push (@j_keywords, { language => 'eng', text => $l_kwe }) if (@$l_kwe);
+  push (@j_keywords, { language => 'deu', text => $l_kw  }) if (@$l_kw);
+
   # TODO: language logic needs to be improved, this is plain bad.
   my %thesis=
   (
@@ -3144,10 +3190,23 @@ mysql> select count(*), abstract_nicht_anzeigen from eprint group by abstract_ni
     next if ($av eq '');
     $thesis{$an2}= $av;
   }
-  $thesis{assessment_date}= get_othes_timestamp($row, 'date_app');
+
+  my $assessment_date= get_othes_timestamp($row, 'date_app');
+
+  if (defined ($assessment_date))
+  {
+    $thesis{assessment_date}= $assessment_date;
+  }
+  else
+  { # 2020-05-29 15:12 nd Commented on gg's message: @nd hast du meine messages bezueglich eprint_id=1982 gesehen? dort ist date_app_year NULL, d.h. es gibt kein assessment date; wir haben 95 solcher objekte ...
+    # diese objekte sollten wir uns notieren. bitte stattdessen einen leeren string uebergeben.
+
+    push (@warnings,  { warning => 'date_app missing, can not assign assessment_date' });
+    $thesis{assessment_date}= '';
+  }
 
 print __LINE__, " thesis: ", Dumper (\%thesis);
-  (\@errors, \%thesis);
+  (\@errors, \@warnings, \%thesis);
 }
 
 sub get_names_for_role
@@ -3157,7 +3216,9 @@ sub get_names_for_role
 
   print __LINE__, " column_names: ", Dumper($column_names);
 
-  my @errors=();
+  my @errors= ();
+  my @warnings= ();
+
   my @result;
   foreach my $column_name (@$column_names)
   {
@@ -3174,8 +3235,9 @@ sub get_names_for_role
       print __LINE__, " column_name=[$column_name] name=[$name] nn=[$nn] vn=[$vn]\n";
 
       if ($vn eq '' || !($vn =~ m#^\U\E[\w\-\x{2010} ]+\.?$#) || !($nn =~ m#^\U\E[\w\-\x{2010} ]+$#))
-      {
+      { # TODO: add option to flag this as a warning instead of as an error
         push (@errors, { error => 'bad_name', column_name => $column_name, name => $name } );
+        push (@result, { family_name => $name }); # fill everything in into family_name
       }
       else
       {
@@ -3184,7 +3246,7 @@ sub get_names_for_role
     }
   }
 
-  (\@errors, \@result);
+  (\@errors, \@warnings, \@result);
 }
 
 sub analyze_files
@@ -3500,26 +3562,25 @@ sub cleanup_keywords
   my @notes;
 
   push (@notes, 'kw_ws_eol')    if ($s =~ s#[\t\s\n]+$##g);    # ignore spaces at the end
-  push (@notes, 'kw_delim_eol') if ($s =~ s#[,;/]+$##g); # ignore delimiters at the end
+  push (@notes, 'kw_delim_eol') if ($s =~ s#[\.,;/]+$##g); # ignore delimiters at the end
   push (@notes, 'kw_lf')        if ($s =~ s#[\r]+##g);  # 
   push (@notes, 'kw_tab')       if ($s =~ s#[\t]+# #g); # tabs are blanks
   push (@notes, 'kw_nl')        if ($s =~ s#\n+# / #g); # newline as delimiter
   $s=~ s/^\s*//;
 # $s=~ s/\s*$//;
-  $s=~ s/ +/ /g;
+# $s=~ s/ +/ /g;
 
   return (['kw_empty'], []) if ($s eq '' or $s =~ /^\s*nicht\s*angegeben\s*\.*\s*$/);
 
   my @keywords;
 
-  my @kw1= split('\s+/\s+', $s);
-  my @kw2= split('\s*;\s*', $s);
-  my @kw3= split('\s*,\s*', $s);
-
-  # print __LINE__, " kw3: ", Dumper(\@kw3);
-
+  my @kw1= split('\s+/[\s\/]+', $s); # "foo / / bar" should be only two fields
   if (@kw1 == 1)
   {
+    my @kw2= split('\s*;\s*', $s);
+    my @kw3= split('\s*,\s*', $s);
+    # print __LINE__, " kw3: ", Dumper(\@kw3);
+
     if (@kw2 > 1)
     {
       push (@notes, 'kw_semicolon');
@@ -3533,13 +3594,32 @@ sub cleanup_keywords
     }
     else
     {
-      push (@notes, 'kw_single');
-      @keywords= @kw1;
+      my @kw4= split('\s*/\s+', $s);
+      if (@kw4 > 1)
+      {
+        @keywords= @kw4;
+        push (@notes, 'kw_slash2');
+      }
+      else
+      {
+        my @kw5= split('\s*/\s*', $s); # last resort: slash with optional blanks on either side, e.g. "foo/bar"
+        if (@kw5 > 1)
+        { # @kw4 is the unsplit single string in this branch; the fields found here are in @kw5
+          @keywords= @kw5;
+          push (@notes, 'kw_slash3');
+        }
+        else
+        {
+          push (@notes, 'kw_single');
+          @keywords= @kw1;
+        }
+      }
     }
   }
   else
   {
     @keywords= @kw1;
+    push (@notes, 'kw_slash');
   }
 
 # print __LINE__, " keywords: ", Dumper (\@keywords);
-- 
GitLab