Skip to content
Snippets Groups Projects
Commit 0b8aeced authored by Gerhard Gonter's avatar Gerhard Gonter :speech_balloon:
Browse files

modifications for utheses uploads

parent e50bdf16
No related branches found
No related tags found
No related merge requests found
......@@ -115,8 +115,8 @@ my $ot2ut_context= 'ot2ut-entw'; # TODO: parametrize
my %map_ot2ut_roles=
(
'advisors' => [qw(betreuer betreuer_2 betreuer_3)],
'coadvisors' => [qw(mitbetreuer mitbetreuer_2)],
'advisers' => [qw(betreuer betreuer_2 betreuer_3)],
'coadvisers' => [qw(mitbetreuer mitbetreuer_2)],
'assessors' => [qw(beurteiler_1 beurteiler_2 beurteiler_3)],
);
......@@ -203,6 +203,7 @@ while (defined ($arg= shift (@ARGV)))
elsif ($opt eq 'debug_names') { $op_mode= 'debug_names'; }
elsif ($opt eq 'debug_classifications' || $opt eq 'DC') { $op_mode= 'debug_classifications'; }
elsif ($opt eq 'debug_keywords') { $op_mode= 'debug_keywords'; }
elsif ($opt eq 'debug_abstracts') { $op_mode= 'debug_abstracts'; }
elsif ($opt eq 'debug_stkz') { $op_mode= 'debug_stkz'; }
elsif ($opt eq 'max') { $MAX_SYNC= $val || shift (@ARGV); }
elsif ($opt eq 'mab-age') { $MAX_MAB_AGE= $val || shift (@ARGV); } # in seconds
......@@ -341,6 +342,10 @@ elsif ($op_mode eq 'debug_keywords')
{
debug_keywords();
}
elsif ($op_mode eq 'debug_abstracts')
{
debug_abstracts();
}
elsif ($op_mode eq 'debug_classifications')
{
# print "cnf: ", Dumper ($cnf);
......@@ -2453,6 +2458,7 @@ sub ot2ut
my $ot2ut= get_any_db($cnf, 'ot2ut_database');
my $col_sync= $ot2ut->get_collection('sync');
my $col_msg= $ot2ut->get_collection('messages');
unless (defined ($utheses_faculty_map))
{
......@@ -2484,6 +2490,8 @@ sub ot2ut
{
last if (defined ($MAX_SYNC) && $cnt_synced >= $MAX_SYNC);
my $t_start= time();
my $sync_info= $col_sync->find_one({eprint_id => $eprint_id});
my ($errors, $row, $lastmod, $ut, $utheses_json_path, $files, $utheses_upload_result_json_path)= generate_utheses_metadata($epr, $eprint_id);
......@@ -2511,7 +2519,7 @@ sub ot2ut
}
else
{
print __LINE__, " earlier sync attempt had errors, retgrying...\n";
print __LINE__, " earlier sync attempt had errors, retrying...\n";
$col_sync->remove( { _id => $sync_info->{_id} } );
$sync_info= undef;
}
......@@ -2529,11 +2537,20 @@ sub ot2ut
print __LINE__, " ERRORS; ut: ", Dumper($ut);
print __LINE__, " generate_utheses_metadata: errors: ", Dumper($errors) if (@$errors);
my $el= { eprint_id => $eprint_id, lastmod => $lastmod, ts_upload => $ts_upload, error_code => 'conversion_errors', error_cnt => scalar @$errors };
my $el= { eprint_id => $eprint_id, lastmod => $lastmod, ts_upload => $ts_upload, context => $ot2ut_context, error_code => 'conversion_errors', error_cnt => scalar @$errors };
push (@synced, $el);
$el->{errors}= $errors;
$col_sync->insert($el);
my $msg=
{
message => "upload error: eprint_id=[$eprint_id] lastmod=[$lastmod] [conversion errors]",
priority => 'normal',
state => 'new',
to => 'oma'
};
$col_msg->insert($msg);
my $utheses_errors_json_path= 'othes/utheses_json/errors/' . $eprint_id . '.json';
Util::JSON::write_json_file($utheses_errors_json_path, $errors);
......@@ -2564,15 +2581,7 @@ sub ot2ut
print __LINE__, " upload_cmd: [", join(' ', @upload_cmd), "]\n";
if ($do_upload)
{
=begin comment
my $upload_result= `@upload_cmd`;
print __LINE__, " upload_result=[$upload_result]\n";
=end comment
=cut
my $t_curl= time();
system(@upload_cmd);
my $result_data;
......@@ -2585,7 +2594,7 @@ sub ot2ut
{
print __LINE__, " can't parse upload_result; error=[$@]\n";
push (@$errors, { error => 'upload_error', error_info => $@ });
my $el= { eprint_id => $eprint_id, lastmod => $lastmod, ts_upload => $ts_upload, error_code => 'upload_error', 1 };
my $el= { eprint_id => $eprint_id, lastmod => $lastmod, ts_upload => $ts_upload, context => $ot2ut_context, error_code => 'upload_error', 1 };
push (@synced, $el);
$el->{errors}= $errors;
$col_sync->insert($el);
......@@ -2609,6 +2618,7 @@ old format 2019-11..2020-01
eprint_id => $eprint_id,
lastmod => $lastmod,
ts_upload => $ts_upload,
context => $ot2ut_context,
error_code => 'ok',
error_cnt => 0,
utheses_id => $utheses_id,
......@@ -2631,6 +2641,7 @@ old format 2019-11..2020-01
eprint_id => $eprint_id,
lastmod => $lastmod,
ts_upload => $ts_upload,
context => $ot2ut_context,
error_code => 'ok',
error_cnt => 0,
utheses_id => $utheses_id,
......@@ -2639,6 +2650,17 @@ old format 2019-11..2020-01
push (@synced, $out_row);
$col_sync->insert($out_row);
my $td_start= time()-$t_start;
my $td_curl= time()-$t_curl;
my $msg=
{
message => "upload success: eprint_id=[$eprint_id] lastmod=[$lastmod] context=[$ot2ut_context] utheses_id=[$utheses_id] time_total=$td_start time_upload=$td_curl",
priority => 'normal',
state => 'new',
to => 'oma'
};
$col_msg->insert($msg);
}
sleep(5);
......@@ -2687,6 +2709,9 @@ sub generate_utheses_metadata
my $row= $all_rows->{$eprintid};
my $history= get_history($epr_db, $eprintid);
# print __LINE__, " history: ", Dumper($history); exit;
my ($lang_pdf, $files)= analyze_files(map { $row->{$_} } qw(fileinfo dir));
print __LINE__, " lang_pdf=[$lang_pdf] files: ", Dumper($files);
my $main_file;
......@@ -2731,8 +2756,7 @@ sub generate_utheses_metadata
my $utp= $ut->{public};
$utp->{origin}= 'import';
$utp->{datamodel}= 'container';
$utp->{uploaded_by}= 'ot2ut';
# not needed/wanted 2020-05-14: $utp->{datamodel}= 'container';
$utp->{rights_statement}= 'http://rightsstatements.org/vocab/InC/1.0/'; # "In Copyright" or "Alle Rechte vorbehalten"
if (defined ($row->{matr}))
......@@ -2795,7 +2819,12 @@ sub generate_utheses_metadata
$utp->{utheses_status}= ($row->{eprint_status} eq 'archive')
? 'published'
: 'work_in_progress'; # objects in eprint_status "buffer" are 'work_in_progress';
$utp->{utheses_status_last_modified}= get_othes_timestamp($row, 'status_changed');
# $utp->{utheses_status_last_modified}= get_othes_timestamp($row, 'status_changed');
# $utp->{phaidra_thesis_doc_added_date}= get_othes_timestamp($history->{create}, 'timestamp');
$utp->{othes}->{history_create}= get_othes_timestamp($history->{create}, 'timestamp');
$utp->{utheses_status_last_modified}= get_othes_timestamp($history->{move_buffer_to_archive}, 'timestamp');
$utp->{last_modified}= get_othes_timestamp($row, 'lastmod');
$utp->{import}= # stored verbatim in mysql table utheses_import in column import_info
{
......@@ -2862,6 +2891,7 @@ sub generate_utheses_metadata
my ($errors2, $classifications)= $epr->get_classifications($eprintid);
push (@errors, @$errors2) if (@$errors2);
# 2020-05-14 nd: not needed: $thesis->{uploaded_by}= 'ot2ut';
$thesis->{subject_classifications}= $classifications;
$thesis->{number_of_pages}= "$main_file->{page_count}"; # Phaidra expects this as a string
......@@ -2869,7 +2899,8 @@ sub generate_utheses_metadata
# Mon May 11 22:12:38 CEST 2020 asked nd about this, especially thesis_doc_added_date:
my %phaidra= map { $_ => '' } qw(container_pid container_status container_created_date thesis_doc_pid thesis_doc_status);
$phaidra{thesis_doc_added_date}= get_othes_timestamp($row, 'datestamp');
$phaidra{thesis_doc_added_date}= get_othes_timestamp($history->{create}, 'timestamp');
# $phaidra{thesis_doc_added_date}= get_othes_timestamp($row, 'datestamp');
$ut->public('phaidra', \%phaidra);
......@@ -2878,6 +2909,43 @@ sub generate_utheses_metadata
(\@errors, $row, $lastmod, $ut, $utheses_json_path, $files, $utheses_upload_result_json_path);
}
# Fetch all eprints 'history' rows for one eprint and extract key events.
#
# Parameters:
#   $epr_db   - DB handle providing get_all_x(table, [condition, @binds])
#   $eprintid - id of the eprint whose history is requested
#
# Returns a hashref:
#   events                 => arrayref of all history rows, ordered by historyid
#   create                 => first row (by historyid) with action 'create', or undef
#   move_buffer_to_archive => first row (by historyid) with action
#                             'move_buffer_to_archive', or undef
sub get_history
{
  my $epr_db= shift;
  my $eprintid= shift;

  my $history_rows= $epr_db->get_all_x('history', ['objectid=?', $eprintid]);
  # print __LINE__, " history_rows: ", Dumper($history_rows);

  # NOTE: a revision can be present multiple times, so events must be ordered
  # by historyid.  Iterating the sorted ids (instead of raw hash order, which
  # is random in Perl) also makes the "first matching action" selection below
  # deterministic; the previous version scanned the hash unsorted.
  my @historyids= sort { $a <=> $b } keys %$history_rows;
  print __LINE__, " historyids: ", join(' ', @historyids), "\n";

  my (@events, $create, $move_buffer_to_archive);
  foreach my $historyid (@historyids)
  {
    my $row= $history_rows->{$historyid};
    # print __LINE__, " history_row: ", Dumper($row);
    push (@events, $row);

    $create= $row if (!defined ($create) && $row->{action} eq 'create');
    $move_buffer_to_archive= $row
      if (!defined ($move_buffer_to_archive) && $row->{action} eq 'move_buffer_to_archive');
  }

  return
  {
    events => \@events,
    create => $create,
    move_buffer_to_archive => $move_buffer_to_archive,
  };
}
sub get_study_id
{
my $matr= shift;
......@@ -2963,11 +3031,16 @@ sub get_thesis_data
push (@j_titles, { type => 'parallel', title_lang => $lang, title_text => $title, origin => 'title' }) if ($title);
push (@j_titles, { type => 'parallel', title_lang => 'deu', title_text => $title_ger, origin => 'title_ger' }) if ($title_ger);
push (@j_titles, { type => 'parallel', title_lang => 'eng', title_text => $title_eng, origin => 'title_eng' }) if ($title_eng);
push (@j_titles, { type => 'parallel', title_lang => $lang, title_text => $title_zusatz, origin => 'title_zusatz' }) if ($title_zusatz);
@j_titles[0]->{type}= 'main';
# push (@j_titles, { type => 'parallel', title_lang => $lang, title_text => $title_zusatz, origin => 'title_zusatz' }) if ($title_zusatz);
$j_titles[0]->{type}= 'main';
if ($title_zusatz)
{
$j_titles[0]->{subtitle_text}= $title_zusatz;
$j_titles[0]->{subtitle_lang}= $lang;
};
my @j_abstracts;
push (@j_abstracts, { language => $lang, text => $abstract, origin => 'abstract' }) if ($abstract);
push (@j_abstracts, { language => 'deu', text => $abstract, origin => 'abstract' }) if ($abstract);
push (@j_abstracts, { language => 'eng', text => $abstract_eng, origin => 'abstract_eng' }) if ($abstract_eng);
my @keywords= split(/\s*\/\s*/, $keywords);
......@@ -2994,7 +3067,7 @@ sub get_thesis_data
(
languages => [ $lang ],
titles => \@j_titles,
abstract => \@j_abstracts,
abstracts => \@j_abstracts,
keywords => \@j_keywords,
type => $map_ot2ut_thesis_type{$row->{thesis_type}},
......@@ -3210,7 +3283,7 @@ sub debug_keywords
my $epr_db= $epr->connect();
my @col_names_db= qw( eprintid eprint_status sprache keywords keywords_eng );
my $search_term= "eprint_status in ('archive', 'buffer') and sprache<>'ger' and sprache<>'eng'";
# my $search_term= "eprint_status in ('archive', 'buffer') and sprache<>'ger' and sprache<>'eng'";
my $search_term= "eprint_status in ('archive', 'buffer')";
my $keys= $epr_db->get_all_x('eprint', [$search_term], join(',', @col_names_db));
......@@ -3273,6 +3346,38 @@ sub debug_keywords
Util::JSON::write_json_file('/backup/othes/eprints/test/othes_keywords.json', \%all_othes);
}
# Debug helper: dump abstract-related columns for a fixed set of eprints.
#
# NOTE(review): this sub was apparently cloned from debug_keywords(); the
# earlier version opened 'all_keywords.tsv' (clobbering debug_keywords()
# output) with a keyword-column header, declared unused keyword hashes and
# never closed the handle.  It now writes a header matching its own columns
# to an abstracts-specific file.
sub debug_abstracts
{
  my $epr= get_eprints_db($cnf);
  my $epr_db= $epr->connect();

  # columns retrieved from the eprint table
  my @col_names_db= qw( eprintid eprint_status sprache abstract abstract_eng );

  # alternative selections, kept for reference:
  # my $search_term= "eprint_status in ('archive', 'buffer') and sprache<>'ger' and sprache<>'eng'";
  # my $search_term= "eprint_status in ('archive', 'buffer') and sprache='ger'";
  # my $search_term= "eprint_status in ('archive', 'buffer') and sprache='eng'";
  # my $search_term= "eprint_status in ('archive', 'buffer')";
  my $search_term= "eprintid in (2276, 3432, 8314, 9358, 10236, 10941, 15148, 15934, 18224, 23898, 27575, 28791, 30614, 32692, 35111, 38069, 40982, 42122, 43078, 44504, 44510, 46380, 46381, 49927, 51776, 52780, 52925, 56916, 60835)";

  my $keys= $epr_db->get_all_x('eprint', [$search_term], join(',', @col_names_db));

  # write to an abstracts-specific file so debug_keywords() output is not overwritten
  open (FO, '>:utf8', 'all_abstracts.tsv') or die;
  print FO join("\t", @col_names_db), "\n";

  foreach my $key (keys %$keys)
  {
    my $r= $keys->{$key};
    print __LINE__, " key=[$key] ", Dumper($r);
    my ($id, $es, $lang, $abs, $abse)= map { $r->{$_} } @col_names_db;
    # possible normalization/output step, not enabled yet:
    # $abs =~ tr/ \t\r\n/ /s;
    # print join("\t", $id, $abs), "\n";
  }
  close (FO);
}
sub debug_stkz
{
my $epr= get_eprints_db($cnf);
......
......@@ -43,7 +43,7 @@ sub fetch_data
$conditions .= ' AND doi IS NULL' if ($c eq 'doi' && !$other_conditions->{doi});
}
$m->show_query(1);
# $m->show_query(1);
my $res= $m->get_all_x ('eprint', [$conditions, $eprint_status],
'eprintid,eprint_status,ac_nummer,type,matr,urn,uri,sperre,einverstaendnis,rev_number'
);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment