From 37d62ee190fbcce26d4ee573a9ee497625484437 Mon Sep 17 00:00:00 2001
From: Gerhard Gonter <ggonter@gmail.com>
Date: Mon, 26 Sep 2022 14:32:22 +0200
Subject: [PATCH] additional functionality for ut1.pl

---
 lib/DataCite/API.pm       |   6 +-
 lib/Univie/Utheses/API.pm |  58 ++--
 ut1.pl                    | 620 +++++++++++++++++++++++++++++++++-----
 3 files changed, 591 insertions(+), 93 deletions(-)

diff --git a/lib/DataCite/API.pm b/lib/DataCite/API.pm
index 5601f17..ce759dd 100644
--- a/lib/DataCite/API.pm
+++ b/lib/DataCite/API.pm
@@ -52,7 +52,7 @@ sub register_doi
   unless ($code1 =~ m#^20[01]#)
   {
     print STDERR "ATTN: register_doi POST metadata returned code1=[$code1] res1=[$res1]\n";
-    return undef;
+    return (0, $res1, undef);
   }
 
   my $doi_reg= <<"EOX";
@@ -66,10 +66,10 @@ EOX
   unless ($code2 =~ m#^20[01]#)
   {
     print STDERR "ATTN: register_doi POST doi returned code2=[$code2] res2=[$res2]\n";
-    return undef;
+    return (0, $res1, $res2);
   }
 
-  1;
+  (1, $res1, $res2);
 }
 
 sub datacite_request
diff --git a/lib/Univie/Utheses/API.pm b/lib/Univie/Utheses/API.pm
index 1b2e896..11f01a1 100644
--- a/lib/Univie/Utheses/API.pm
+++ b/lib/Univie/Utheses/API.pm
@@ -20,45 +20,65 @@ sub getContainerPublicMetadata
   my $self= shift;
   my $utheses_id= shift;
 
-  my ($code1, $res1)= $self->utheses_request('GET', 'container/get/public', $utheses_id);
-  print __LINE__, " code1=[$code1] res1=[$res1]\n";
-
-  my $info;
-  if ($code1 =~ m#^2#)
-  {
-    eval
-    {
-      $info= from_json($res1);
-    };
-    if ($@)
-    {
-      die $@;
-    }
-  }
+  my ($code1, $res1, $info)= $self->utheses_request('GET', 'container/get/public', $utheses_id);
+  print __LINE__, " code1=[$code1] res1=[$res1] info=[$info]\n";
 
   # TODO: Error handling!
   $info;
 }
 
+sub getPendingDoisCreateRequest
+{
+  my $self= shift;
+
+  my ($code1, $res1, $info)= $self->utheses_request('GET', 'doi/get/createRequest', undef, );
+  print __LINE__, " code1=[$code1] res1=[$res1] info=[$info]\n";
+
+  # TODO: Error handling!
+  (wantarray) ? ($code1, $res1, $info) : $info;
+}
+
 sub utheses_request
 {
   my $self=   shift;
   my $method= shift;
   my $what=   shift;
-  my $id=     shift;
+  my $par=    shift;
 
-  my ($api_url)= map { $self->{config}->{$_} } qw(api_url);
+  my ($api_url, $headers)= map { $self->{config}->{$_} } qw(api_url headers);
 
-  my $req_url= join ('/', $api_url, $what, $id);
+  my $req_url= join ('/', $api_url, $what, $par);
   print __LINE__, " req_url=[$req_url]\n";
   my $req = HTTP::Request->new( $method => $req_url );
+  if (defined ($headers))
+  {
+    foreach my $h (@$headers)
+    {
+      $req->header(@$h);
+    }
+  }
 
   my $ua= LWP::UserAgent->new;
   my $res= $ua->request($req);
 
   # print __LINE__, " res: ", main::Dumper($res);
   my $txt= decode("utf8", $res->content());
-  return ($res->code(), $txt);
+  my $code= $res->code();
+
+  my $info;
+  if ($code =~ m#^2#)
+  {
+    eval
+    {
+      $info= from_json($txt);
+    };
+    if ($@)
+    {
+      die $@; # TODO: this should be handled mor gracefully!
+    }
+  }
+
+  return ($code, $txt, $info);
 }
 
 1;
diff --git a/ut1.pl b/ut1.pl
index 71d6154..4cf8654 100755
--- a/ut1.pl
+++ b/ut1.pl
@@ -3,6 +3,7 @@
 use strict;
 
 use FileHandle;
+use utf8;
 
 binmode( STDOUT, ':utf8' ); autoflush STDOUT 1;
 binmode( STDERR, ':utf8' ); autoflush STDERR 1;
@@ -11,21 +12,36 @@ binmode( STDIN,  ':utf8' );
 use Data::Dumper;
 $Data::Dumper::Indent= 1;
 
+use Digest::MD5::File qw(file_md5_hex);
+
+use Util::ts;
+use Util::JSON;
+use Redmine::DB::MySQL;
+
+use Phaidra::Utils::iso639;
+
 use lib 'lib';
 use Univie::Utheses::API;
-use Phaidra::Utils::iso639;
-use Util::JSON;
 use DataCite::API;
+use IRMA::NA;
+use Alma::MARC_Extractor;
 
 my @TSV_COLUMNS= qw( utheses_id fulltext_locked suffix doi nbn ac_number langs language persistent_link xml_fnm errors );
 
 # my $op_mode= 'fetch_metadata_bulk';
-my $op_mode= 'analyze';
+my $op_mode= 'process';
 my $fnm_tsv= 'utheses/utheses_info.tsv'; # TODO: timestamp!
 
-my $config_file= '/etc/irma/DataCite.json';
+# TODO: make these configurable too!
+my $agent_config_file= '/etc/irma/pidagent.json';
+
+my $MAX_MARC_AGE= 86400*60;
+my $MAX_MARC_REQUESTS= 10_000;
 
 my $do_register_doi= 0;
+my $agent_name= 'pidagent';
+my $agent_id= $$;
+my $fix_problems= 0;
 
 my @pars= ();
 while (my $arg= shift (@ARGV))
@@ -36,6 +52,7 @@ while (my $arg= shift (@ARGV))
     my ($opt, $val)= split ('=', $1, 2);
        if ($opt eq 'help') { usage(); }
     elsif ($opt eq 'register-doi') { $do_register_doi= 1; }
+    elsif ($opt eq 'fix') { $fix_problems= 1; }
     else { usage(); }
   }
   elsif ($arg =~ /^-(.+)/)
@@ -53,25 +70,84 @@ while (my $arg= shift (@ARGV))
   }
 }
 
-my $cnf= Util::JSON::read_json_file ($config_file);
-# print __LINE__, " cnf: ", main::Dumper ($cnf); exit(0);
-my $ut_cnf= $cnf->{repositories}->{'utheses.univie.ac.at'};
+# load configuration
+my $agent_cnf= Util::JSON::read_json_file ($agent_config_file);
+# print __LINE__, " agent_cnf: ", main::Dumper ($agent_cnf); exit(0);
+my $dc_cnf= Util::JSON::read_json_file ($agent_cnf->{DataCite_config});
+# print __LINE__, " dc_cnf: ", main::Dumper ($dc_cnf); exit(0);
+my $ut_cnf= $dc_cnf->{repositories}->{'utheses.univie.ac.at'};
 # print __LINE__, " ut_cnf: ", main::Dumper ($ut_cnf); exit(0);
-my $reg_cnf= $cnf->{doi_registries}->{$ut_cnf->{registry}};
-# print __LINE__, " reg_cnf: ", main::Dumper ($reg_cnf); exit(0);
+my $dc_reg_cnf= $dc_cnf->{doi_registries}->{my $dc_cnf_name= $ut_cnf->{registry}};
+# print __LINE__, " dc_reg_cnf: ", main::Dumper ($dc_reg_cnf); exit(0);
+
+# prepare APIs
+my $ut_api= new Univie::Utheses::API( config => $ut_cnf->{api_config} );
+die "no ut_api" unless (defined ($ut_api));
 
-  my $utapi= new Univie::Utheses::API( config => { api_url => $ut_cnf->{api_url} } );
-  die "no utapi" unless (defined ($utapi));
+my $reg_obj= new DataCite::API (config => $dc_reg_cnf, xmode => 'test');
+die "no reg_obj" unless (defined ($reg_obj));
 
-  my $reg_obj= new DataCite::API (config => $reg_cnf, xmode => 'test');
-  die "no reg_obj" unless (defined ($reg_obj));
+# get handles for various databases
+my $marc_db=  IRMA::db::get_any_db($agent_cnf, 'marc_database');
+my $agent_db= IRMA::db::get_any_db($agent_cnf, 'pidagent_database');
+print __LINE__, " agent_db=[$agent_db]\n";
 
-if ($op_mode eq 'analyze')
+my @marc_fields= qw(ac_number mms_id fetched lib_code);
+my $mex= Alma::MARC_Extractor->new(\@marc_fields);
+
+my $running= 0;
+if ($op_mode eq 'process')
 {
-  foreach my $par (@pars)
+  do
   {
-    if ($par =~ m#^\d+$#) { analyze_utheses_item($par); }
-  }
+    while (my $par= shift (@pars))
+    {
+      if ($par =~ m#^(\d+)$#
+          || $par =~ m#https?://utheses.univie.ac.at/detail/(\d+)[\#/]?#
+         )
+      {
+        my $utheses_id= $1;
+        my @actions= process_utheses_item($utheses_id);
+        print __LINE__, " process_utheses_item($utheses_id) ==> [", join(', ', @actions), "]\n";
+      }
+      elsif ($par =~ m#^(\d+)\.\.(\d+)$# || $par =~ m#^(\d+)\-(\d+)$# || $par =~ m#(blk)(\d+)$#)
+      {
+        my ($start, $end)= ($1, $2);
+        ($start, $end)= ($end*100, $end*100+99) if ($start eq 'blk');
+        my $item_count= $end-$start;
+        if ($start <= $end && (($item_count <= 2000 && $do_register_doi) || ($item_count <= 5000 && !$do_register_doi)))
+        {
+          foreach (my $utheses_id= $start; $utheses_id <= $end; $utheses_id++)
+          {
+            push (@pars, "$utheses_id"); # cast to string!
+          }
+        }
+      }
+      elsif ($par =~ m#^AC\d{8}$#)
+      {
+        my @actions= process_ac_number($par);
+        print __LINE__, " process_ac_number($par) ==> [", join(', ', @actions), "]\n";
+      }
+      elsif ($par eq 'queue')
+      {
+        my @utheses_ids= get_job_from_queue();
+        print __LINE__, ' queued utheses_ids: ', join(', ', @utheses_ids), "\n";
+        sleep(10);
+        push (@pars, @utheses_ids, 'cleanup');
+      }
+      elsif ($par eq 'cleanup')
+      {
+        cleanup_queue();
+      }
+      elsif ($par eq 'gpdcr')
+      {
+        my @utheses_ids= gpdcr();
+        # push (@pars, @utheses_ids);
+      }
+
+      sleep(2) if (@pars);
+    }
+  } while($running);
 }
 elsif ($op_mode eq 'fetch_metadata_bulk')
 {
@@ -79,55 +155,372 @@ elsif ($op_mode eq 'fetch_metadata_bulk')
 }
 exit(0);
 
-sub analyze_utheses_item
+sub gpdcr
+{
+  my ($status, $txt, $info)= $ut_api->getPendingDoisCreateRequest();
+  print __LINE__, " gpdcr: info: ", Dumper($info);
+
+  my @utheses_ids= ();
+  if ($status eq '200')
+  {
+    my $p= $info->{pendingDois};
+    if (defined ($p) && ref($p) eq 'HASH')
+    {
+      push (@utheses_ids, map { $p->{$_}->{utheses_id} } keys %$p);
+    }
+    else
+    {
+      die "unexpected type of pendingDois";
+    }
+  }
+
+  @utheses_ids;
+}
+
+sub cleanup_queue
+{
+  my $q_col= $agent_db->get_collection('queue');
+
+  my $j= $q_col->find_one({status => "in_progress", agent_name => $agent_name, agent_id => $agent_id});
+  if (defined ($j))
+  {
+    print __LINE__, " finishing old job ", Dumper($j);
+    $q_col->update({_id => $j->{_id}}, { '$set' => {status => "done", agent_name => $agent_name, agent_id => $agent_id }});
+    sleep(10);
+  }
+}
+
+sub get_job_from_queue
+{
+  my $q_col= $agent_db->get_collection('queue');
+
+  my $j= $q_col->find_one({status => "in_progress", agent_name => $agent_name});
+  if (defined ($j))
+  {
+    print __LINE__, " resuming old job ", Dumper($j);
+  }
+  else
+  {
+    $j= $q_col->find_one({status => "new"});
+  }
+  return () unless (defined ($j));
+  print __LINE__, " found new job ", Dumper($j);
+
+  $q_col->update({_id => $j->{_id}}, { '$set' => {status => "in_progress", agent_name => $agent_name, agent_id => $agent_id }});
+
+  my @ut_ids;
+  if (exists($j->{utheses_ids})) { push (@ut_ids, @{$j->{utheses_ids}}); }
+  if (exists($j->{utheses_id}))  { push (@ut_ids,   $j->{utheses_id}); }
+
+  return @ut_ids;
+}
+
+sub process_utheses_item
 {
   my $utheses_id= shift;
 
-  my ($row, $xml)= get_utheses_metadata($utheses_id);
+  print __LINE__, " process_utheses_item: utheses_id=[$utheses_id] ", '='x50, "\n";
+
+  my $register_doi_ok= 1;
+  my $row= {};
+  my ($error_code, $status, $xml)= get_utheses_metadata($row, $utheses_id);
 
-  print __LINE__, " utheses_id=[$utheses_id] ", '='x50, "\n";
+  print __LINE__, " error_code=[$error_code] status=[$status]\n";
   print __LINE__, " xml=[$xml]\n";
   print __LINE__, " row: ", Dumper($row);
 
-  if ($do_register_doi)
+  my @actions= ('fetched_utheses');
+  unless ($status eq '200')
+  {
+    push (@actions, 'no_utheses_record');
+    return @actions;
+  }
+
+  unless ($error_code eq 'ok')
+  {
+    push (@actions, "error_code=[$error_code]");
+    report_problem( { area => 'utheses', problem => 'error_code', utheses_id => $utheses_id, error_code => $error_code } );
+    $register_doi_ok= 0;
+  }
+
+  my @x= analyze_marc_record($row);
+  push (@actions, @x);
+
+  unless (defined ($agent_db))
+  {
+    return @actions;
+  }
+
+  my $ut_col= $agent_db->get_collection('utheses');
+
+  if ($fix_problems) { remove_problem('utheses', $row->{utheses_id}); }
+  # first: compare existing utheses record (if it exists) with data from utheses
+  my $ut_data= $ut_col->find_one( { utheses_id => $row->{utheses_id} } );
+  if (defined ($ut_data))
+  {
+    my @cmp_fields= qw(doi urn nbn ac_number);
+    my @problems= ();
+    foreach my $f (@cmp_fields)
+    {
+      if (exists($ut_data->{$f}) && defined($ut_data->{$f}) && $row->{$f} ne $ut_data->{$f})
+      {
+        push (@problems, { problem => 'missmatch', field => $f, recorded => $ut_data->{$f}, found => $row->{$f} });
+      }
+    }
+
+    if (@problems)
+    {
+      report_problem( { area => 'utheses', problem => 'utheses_data_missmatch', utheses_id => $utheses_id, problems => \@problems } );
+      push(@actions, 'problem report utheses');
+      return (@actions);
+    }
+  }
+
+  # second: update utheses record in database
+  my $res_upd= $ut_col->update( { utheses_id => $row->{utheses_id} }, $row, { upsert => 1 } );
+  print __LINE__, " res_upd: ", Dumper($res_upd);
+
+  my ($ok, $verdict, $msg1, $msg2);
+  if ($do_register_doi && $register_doi_ok)
   {
-    unless (defined ($row))
+    if ($fix_problems)
     {
-      print "ATTN: can't register DOI: no utheses data found\n";
-      goto END;
+      remove_problem('datacite', $row->{utheses_id}, 'utheses_id');
+      remove_problem('datacite', $row->{doi});
     }
 
-    unless (defined ($xml))
+    my $dc_col= $agent_db->get_collection('datacite');
+
+    my ($utheses_id, $doi, $xml_md5, $url)= map { $row->{$_} } qw(utheses_id doi xml_md5 persistent_link);
+    my $dc_record= $dc_col->find_one( { doi => $doi } );
+
+    # prepare registration data
+    my %reg_info=
+    (
+      doi          => $row->{doi},
+      url          => $url,
+      context      => 'utheses',
+      registry     => $dc_cnf_name,
+      utheses_id   => $utheses_id,
+      ac_number    => $row->{ac_number},
+      xml_md5      => $xml_md5,
+      ts_epoch     => time(),
+      ts_iso_gmt   => Util::ts::ts_ISO_gmt(),
+    );
+
+    # check existing datacite registration record, if it exists
+    if (defined ($dc_record))
     {
-      print "ATTN: can't register DOI: no xml data generated\n";
-      goto END;
+      print __LINE__, " datacite record found: ", Dumper($dc_record);
+
+      # check if record matches with new data
+      my @cmp_fields= qw(registry context url utheses_id ac_number);
+      my @problems= ();
+      foreach my $f (@cmp_fields)
+      {
+        if (exists($dc_record->{$f}) && defined($dc_record->{$f}))
+        {
+          if ($reg_info{$f} ne $dc_record->{$f})
+          {
+            push (@problems, { problem => 'missmatch', field => $f, recorded => $dc_record->{$f}, found => $reg_info{$f} });
+          }
+          # else: record matches, that's ok
+        }
+        else
+        {
+          push (@problems, { problem => 'missing', field => $f });
+        }
+      }
+
+      if (@problems)
+      {
+        report_problem( { area => 'datacite', problem => 'datacite_data_missmatch', utheses_id => $utheses_id, doi => $doi, problems => \@problems } );
+        push(@actions, 'problem report datacite');
+        return (@actions);
+      }
+
+      if ($dc_record->{xml_md5} eq $xml_md5 && $dc_record->{reg_status} == 1)
+      {
+        print __LINE__, " datacite metadata unchanged; not updating!\n";
+        push (@actions, 'datacite doi metadata unchanged');
+        return @actions;
+      }
     }
 
-    my ($doi, $url, $errors, $ftl)= map { $row->{$_} } qw(doi persistent_link datacite_conversion_errors fulltext_locked);
-    if ($ftl)
+    ($ok, $verdict, $msg1, $msg2)= register_doi_with_DataCite_for_utheses($row, $xml);
+    print __LINE__, " ok=[$ok] msg1=[$msg1] msg2=[$msg2] verdict=[$verdict]\n";
+    push (@actions, (($ok) ? 'datacite doi registration ok' : 'datacite doi registration failed'), $verdict);
+
+    $reg_info{reg_status}= $ok;
+    $reg_info{reg_verdict}= $verdict,
+    $reg_info{reg_message1}= $msg1;
+    $reg_info{reg_message2}= $msg2;
+
+    print __LINE__, " insert into datacite; reg_info: ", Dumper(\%reg_info);
+    my $res= $dc_col->update( { doi => $doi }, \%reg_info, { upsert => 1 } );
+    print __LINE__, " insert res: ", Dumper($res);
+
+    unless ($ok)
     {
-      print "ATTN: can't register DOI: fulltext locked\n";
-      goto END;
+      report_problem( { area => 'datacite', problem => 'registration failure', utheses_id => $utheses_id, doi => $doi, reg_info => \%reg_info } );
     }
+  }
+
+  @actions;
+}
+
+sub process_ac_number
+{
+  my $ac_number= shift;
 
-    if (@$errors)
+  my $row= {};
+  my @actions= analyze_marc_record($row, $ac_number);
+
+  if (exists($row->{val_utheses}))
+  {
+    my $ut_link= $row->{val_utheses};
+    if ($ut_link =~ m#https?://utheses.univie.ac.at/detail/(\d+)[\#/]?#)
+    {
+      my $utheses_id= $1;
+      my ($error_code, $status, $xml)= get_utheses_metadata($row, $utheses_id);
+    }
+    else
     {
-      print "ATTN: can't register DOI $doi due to errors: ", join(', ', @$errors), "\n";
-      goto END;
+      # TODO: report bad link in Alma
     }
+  }
+
+  print __LINE__, " process_ac_number: ac_number=[$ac_number] row: ", Dumper($row);
+}
+
+sub analyze_marc_record
+{
+  my $row= shift;
+  my $ac_number= shift || $row->{ac_number};
+
+  return ($row->{marc_record}= 'no_marc_db')        unless (defined ($marc_db));
+  return ($row->{marc_record}= 'invalid_ac_number') unless ($ac_number =~ m#^AC\d{8}$#);
+
+  my @actions= ();
+  my $marc_col= $marc_db->get_collection('alma.marc');
+  my $marc_rec= $marc_col->find_one({ ac_number => $ac_number });
+  print __LINE__, " marc_rec: ", Dumper($marc_rec);
+  my $request_marc_rec= 0;
 
-    if ($reg_obj->register_doi ($doi, $xml, $url))
+  if (defined ($marc_rec))
+  {
+    my $marc_fetched= $marc_rec->{fetched};
+
+    my $best_before= $marc_fetched + $MAX_MARC_AGE;
+    my $now= time();
+    print __LINE__, " marc_fetched=[$marc_fetched] best_before=[$best_before] now=[$now]\n";
+
+    if ($best_before > $now)
     {
-      print "NOTE: register_doi doi=[$doi] url=[$url] OK\n";
+      $row->{marc_record}= 'ok';
     }
     else
     {
-      print "ATTN: register_doi doi=[$doi] url=[$url] was not ok\n";
+      print __LINE__, " marc_record too old\n";
+      $row->{marc_record}= 'too_old';
+      $request_marc_rec= 1;
     }
+
+    # check marc record, even when it is too old
+    $row->{marc}= my $x= {};
+    $mex->extract_identifiers($marc_rec, $x);
+    push (@actions, 'marc_rec_checked');
+  }
+  else
+  {
+    $row->{marc_record}= 'not_found';
+    $request_marc_rec= 1;
+  }
+
+  if ($request_marc_rec)
+  {
+    my $req_col= $marc_db->get_collection('requests');
+
+    my $req=
+    {
+      agent        => 'alma_cat',
+      status       => 'new',
+      action       => 'update_alma_2xml',
+      ac_number    => $ac_number,
+      requested_by => $agent_name,
+    };
+    $req_col->insert ($req);
+
+    push (@actions, 'marc_rec_requested');
+    print __LINE__, " analyze_marc_record: insert requests: ", Dumper($req);
+  }
+
+  @actions;
+}
+
+sub register_doi_with_DataCite_for_utheses
+{
+  my $row= shift;
+  my $xml= shift;
+
+  my ($rejected, $accepted);
+  my $ok= 0;
+
+  unless (defined ($row))
+  {
+    $rejected= "can't register DOI: no utheses data found";
+    goto END;
+  }
+
+  unless (defined ($xml))
+  {
+    $rejected= "can't register DOI: no xml data generated";
+    goto END;
+  }
+
+  my ($doi, $url, $errors, $pol)= map { $row->{$_} } qw(doi persistent_link datacite_conversion_errors policies);
+  my ($ftl, $lks)= map { $pol->{$_} } qw(fulltext_locked lock_status);
+  if (!defined ($ftl) || $ftl)
+  {
+    $rejected= "can't register DOI: fulltext locked ($ftl)";
+    goto END;
+  }
+
+  if (!defined ($lks) || $lks)
+  {
+    $rejected= "can't register DOI: lock_status=[$lks]";
+    goto END;
+  }
+
+  if (@$errors)
+  {
+    $rejected= "can't register DOI $doi due to errors: ", join(', ', @$errors);
+    goto END;
+  }
+
+  my ($reg_res, $reg_msg1, $reg_msg2)= $reg_obj->register_doi ($doi, $xml, $url);
+  if ($reg_res)
+  {
+    $accepted= "register_doi doi=[$doi] url=[$url] OK";
+    $ok= 1;
+  }
+  else
+  {
+    $rejected= "register_doi doi=[$doi] url=[$url] was not ok";
   }
 
 END:
-  return;
+
+  if ($rejected)
+  {
+    print "REJECTED: ", $rejected, "\n";
+  }
+  else
+  {
+    print "ACCEPTED: ", $accepted, "\n";
+  }
+
+  return ($ok, ($rejected || $accepted), $reg_msg1, $reg_msg2);
 }
 
 sub fetch_metadata_bulk
@@ -142,54 +535,90 @@ sub fetch_metadata_bulk
   {
     next if ($utheses_id eq 'utheses_id'); # CSV column name...
 
-    my ($row, $xml)= get_utheses_metadata($utheses_id);
+    my $row;
+    my ($error_code, $status, $xml)= get_utheses_metadata($row, $utheses_id);
 
     if (defined ($row))
     {
       print TSV join("\t", map { $row->{$_} } @TSV_COLUMNS), "\n";
-
     }
   }
 }
 
+=head2 my ($error_code, $status, $xml)= get_utheses_metadata($row, $utheses_id)
+
+return DataCite_XML document for a given utheses_id and fill in $row
+with important information
+
+=cut
+
 sub get_utheses_metadata
 {
+  my $row= shift;
   my $utheses_id= shift;
 
   print __LINE__, " utheses_id=[$utheses_id]\n";
-  my $info= $utapi->getContainerPublicMetadata($utheses_id);
+  my $info= $ut_api->getContainerPublicMetadata($utheses_id);
   print __LINE__, " info: ", Dumper($info);
 
-  my ($row, $xml);
-
-  if ($info->{status} eq '200')
+  my $xml;
+  my $status= $info->{status};
+  my $error_code= 'unknown';
+  if ($status eq '200')
   {
-    ($row, $xml)= utheses2datacite($info, $utheses_id);
+    $xml= utheses2datacite($row, $info, $utheses_id);
     # print __LINE__, " row: ", Dumper($row);
 
     if (defined ($row->{doi}))
     {
-      $row->{xml_fnm}= my $xml_fnm= 'utheses/DataCite_XML/'. $row->{doi}. '.xml';
-      # print __LINE__, " DataCite_XML=[$xml_fnm] xml=[$xml]\n";
-
-      if (defined ($xml) && defined ($row->{xml_fnm}) && !@{$row->{datacite_conversion_errors}})
+      if (defined ($xml) && defined (my $xml_fnm= $row->{xml_fnm}))
       {
-        open (XML, '>:utf8', $row->{xml_fnm}) or die; # TODO: or do something else...
+        $row->{xml_fnm}= my $xml_fnm= 'utheses/DataCite_XML/'. $row->{doi}. '.xml';
+        # print __LINE__, " DataCite_XML=[$xml_fnm] xml=[$xml]\n";
+        open (XML, '>:utf8', $xml_fnm) or die "can't write to $xml_fnm"; # TODO: or do something else...
         print XML $xml;
         close (XML);
+        my $md5= file_md5_hex($xml_fnm);
+        print __LINE__, " xml_fnm=[$xml_fnm] md5=[$md5]\n";
+        $row->{xml_md5}= $md5;
+      }
+
+      if ($row->{datacite_conversion_error_count} == 0)
+      {
+        $error_code= 'ok';
       }
+      else
+      {
+        printf ERRORS ("utheses_id=%d datacite conversion errors\n", $utheses_id);
+        $error_code= 'datacite_conversion_errors';
+      }
+    }
+    else
+    {
+      printf ERRORS ("utheses_id=%d no doi defined\n", $utheses_id);
+      $error_code= 'no_doi_defined';
     }
   }
   else
   {
-    printf ERRORS ("utheses_id=%d status=%s\n", $utheses_id, $info->{status});
+    printf ERRORS ("utheses_id=%d bad status=%s\n", $utheses_id, $status);
+    $error_code= 'bad_status';
   }
 
-  return ($row, $xml);
+  $row->{error_code}= $error_code;
+  return ($error_code, $status, $xml);
 }
 
+=head2 my $xml= utheses2datacite ($row, $info, $utheses_id)
+
+Transcribe utheses $info for a given utheses_id into xml and return that.
+$row is filled with information extracted from $info.
+
+=cut
+
 sub utheses2datacite
 {
+  my $row= shift;
   my $info= shift;
   my $utheses_id= shift;
 
@@ -204,6 +633,9 @@ sub utheses2datacite
   if ($doi =~ m#^10.25365/(thesis\.\d+)$#)
   {
     $suffix= $1;
+    # rewrite DOI, if necessary, e.g. for the test environment
+    $doi= join ('/', $ut_cnf->{use_prefix}, $suffix) if (exists($ut_cnf->{use_prefix}));
+    # $doi= join ('/', '10.493943', $suffix); # TEST: fake prefix
   }
   else
   {
@@ -215,8 +647,7 @@ sub utheses2datacite
   push (@datacite_conversion_errors, "bad_url=[$persistent_link]") unless ($persistent_link =~ m#^https://utheses.univie.ac.at/detail/\d+/?$#);
 
   my ($titles, $abstracts, $publication_date, $langs, $policies)= map { $th->{$_} } qw(titles abstracts publication_date languages policies);
-  my $fulltext_locked= $policies->{fulltext_locked};
-  
+
   my $publication_year;
   if ($publication_date =~ m#^(\d{4})-?#)
   {
@@ -249,6 +680,8 @@ sub utheses2datacite
   <creators>
 EOX
 
+  # $xml .= "  <urxn>bla</urxn>\n";  # TEST: check what happens with invalid XML
+
   foreach my $author (@$authors)
   {
     $xml .= << "EOX";
@@ -317,23 +750,24 @@ EOX
 </resource>
 EOX
 
-  my %row=
-  (
-    utheses_id => $utheses_id,
-    suffix => $suffix,
-    doi => $doi,
-    nbn => $nbn,
-    ac_number => $ac_number,
-    langs => join(',', @$langs),
-    language => $main_language,
-    persistent_link => $persistent_link,
-    datacite_conversion_errors => \@datacite_conversion_errors,
-    fulltext_locked => $fulltext_locked,
-    phaidra => $phaidra,
-    policies => $policies,
-  );
-
-  (\%row, $xml);
+  $row->{utheses_fetched}= time();
+  $row->{utheses_id}= $utheses_id;
+  $row->{suffix}= $suffix;
+  $row->{doi}= $doi;
+  $row->{nbn}= $nbn;
+  $row->{ac_number}= $ac_number;
+  $row->{langs}= join(',', @$langs);
+  $row->{language}= $main_language;
+  $row->{persistent_link }=$persistent_link;
+  $row->{datacite_conversion_errors}= \@datacite_conversion_errors;
+  $row->{datacite_conversion_error_count}= scalar @datacite_conversion_errors;
+  $row->{phaidra}= $phaidra;   # ... mapped, see below
+  $row->{policies}= $policies; # ... mapped, see below
+
+  # foreach my $an (qw(fulltext_locked lock_status lock_until_date)) { $row->{$an}= $policies->{$an} }
+  # foreach my $an (qw(container_pid container_status thesis_doc_pid thesis_doc_status)) { $row->{$an}= $phaidra->{$an} }
+
+  $xml;
 }
 
 sub xml_escape
@@ -347,3 +781,47 @@ sub xml_escape
   $s;
 }
 
+sub report_problem
+{
+  my $problem= shift;
+
+  my $ut_col= $agent_db->get_collection('problems');
+
+  $problem->{ts_iso_gmt}= Util::ts::ts_ISO_gmt();
+
+  my $area= $problem->{area};
+  my $check_id;
+     if ($area eq 'utheses')  { $check_id= 'utheses_id' }
+  elsif ($area eq 'marc')     { $check_id= 'ac_number' }
+  elsif ($area eq 'datacite') { $check_id= 'doi' }
+
+  $ut_col->update( { area => $area, $check_id => my $id= $problem->{$check_id} }, $problem, { upsert => 1 } ); # replace problem report, if one already exists
+
+  print __LINE__, " ATTN: problem reported for $check_id=$id: ", Dumper($problem);
+}
+
+sub remove_problem
+{
+  my $area= shift;
+  my $id= shift;
+  my $check_id= shift;
+
+  my $ut_col= $agent_db->get_collection('problems');
+
+  unless (defined ($check_id))
+  {
+       if ($area eq 'utheses')  { $check_id= 'utheses_id' }
+    elsif ($area eq 'marc')     { $check_id= 'ac_number' }
+    elsif ($area eq 'datacite') { $check_id= 'doi' }
+  }
+
+  $ut_col->remove({ area => $area, $check_id => $id });
+}
+
+__END__
+
+=head1 TODO
+
+* handle signals like SIGINT
+
+
-- 
GitLab