latest version of ut1.pl

c3d44902 · Gerhard Gonter · 3059a218 · c3d44902
Commit c3d44902 authored 2 years ago by Gerhard Gonter
--- a/ut1.pl
+++ b/ut1.pl
@@ -13,6 +13,8 @@ use Data::Dumper;
 $Data::Dumper::Indent= 1;
 use Digest::MD5::File qw(file_md5_hex);
+use Digest::MD5 qw(md5_hex);
+use Encode qw(encode_utf8);
 use Util::ts;
 use Util::JSON;
@@ -28,7 +30,6 @@ use Alma::MARC_Extractor;
 my @TSV_COLUMNS= qw( utheses_id fulltext_locked suffix doi nbn ac_number langs language persistent_link xml_fnm errors );
-# my $op_mode= 'fetch_metadata_bulk';
 my $op_mode= 'process';
 my $fnm_tsv= 'utheses/utheses_info.tsv'; # TODO: timestamp!
@@ -38,6 +39,8 @@ my $agent_config_file= '/etc/irma/pidagent.json';
 my $MAX_MARC_AGE= 86400*60;
 my $MAX_MARC_REQUESTS= 10_000;
+my %metrics;
 my $do_register_doi= 0;
 my $agent_name= 'pidagent';
 my $agent_id= $$;
@@ -53,6 +56,7 @@ while (my $arg= shift (@ARGV))
       if ($opt eq 'help') { usage(); }
    elsif ($opt eq 'register-doi') { $do_register_doi= 1; }
    elsif ($opt eq 'fix') { $fix_problems= 1; }
+    elsif ($opt eq 'fmb') { my $op_mode= 'fetch_metadata_bulk'; }
    else { usage(); }
  }
  elsif ($arg =~ /^-(.+)/)
@@ -89,12 +93,30 @@ die "no reg_obj" unless (defined ($reg_obj));
 # get handles for various databases
 my $marc_db=  IRMA::db::get_any_db($agent_cnf, 'marc_database');
-my $agent_db= IRMA::db::get_any_db($agent_cnf, 'pidagent_database');
+my $marc_col= $marc_db->get_collection('alma.marc');
-print __LINE__, " agent_db=[$agent_db]\n";
+my $req_col= $marc_db->get_collection('requests');
 my @marc_fields= qw(ac_number mms_id fetched lib_code);
 my $mex= Alma::MARC_Extractor->new(\@marc_fields);
+my $agent_db= IRMA::db::get_any_db($agent_cnf, 'pidagent_database');
+# print __LINE__, " agent_db=[$agent_db]\n";
+my $ut_col= $agent_db->get_collection('utheses');
+my $dc_col= $agent_db->get_collection('datacite');
+my $q_col= $agent_db->get_collection('queue');
+my $problems_col= $agent_db->get_collection('problems');
+my ($nx_metrics_temp, $nx_metrics_prom);
+my @REPORT;
+if (exists ($agent_cnf->{'collector-textfile-directory'})
+    && defined (my $p= $agent_cnf->{'collector-textfile-directory'})
+   )
+{
+  $nx_metrics_temp= join('/', $p, 'ut1.temp');
+  $nx_metrics_prom= join('/', $p, 'ut1.prom');
+}
 my $running= 0;
 if ($op_mode eq 'process')
 {
@@ -109,6 +131,7 @@ if ($op_mode eq 'process')
        my $utheses_id= $1;
        my @actions= process_utheses_item($utheses_id);
        print __LINE__, " process_utheses_item($utheses_id) ==> [", join(', ', @actions), "]\n";
+        push (@REPORT, join(' ', $utheses_id, @actions));
      }
      elsif ($par =~ m#^(\d+)\.\.(\d+)$# || $par =~ m#^(\d+)\-(\d+)$# || $par =~ m#(blk)(\d+)$#)
      {
@@ -141,8 +164,10 @@ if ($op_mode eq 'process')
      }
      elsif ($par eq 'gpdcr')
      {
-        my @utheses_ids= gpdcr();
+        my $utheses_ids1= gpdcr();
-        # push (@pars, @utheses_ids);
+        my $utheses_ids2= remove_registered_DOIs($utheses_ids1);
+        print __LINE__, " gpdcr utheses_ids2: ", join (' ', @$utheses_ids2), "\n";
+        push (@pars, @$utheses_ids2);
      }
      sleep(2) if (@pars);
@@ -153,17 +178,26 @@ elsif ($op_mode eq 'fetch_metadata_bulk')
 {
  fetch_metadata_bulk(\@pars);
 }
+print __LINE__, " metrics: ", Dumper(\%metrics);
+write_metrics(\%metrics);
+print __LINE__, " REPORT:\n";
+foreach my $l (@REPORT) { print $l, "\n"; }
 exit(0);
 sub gpdcr
 {
  my ($status, $txt, $info)= $ut_api->getPendingDoisCreateRequest();
-  print __LINE__, " gpdcr: info: ", Dumper($info);
+  # print __LINE__, " gpdcr: info: ", Dumper($info);
  my @utheses_ids= ();
  if ($status eq '200')
  {
    my $p= $info->{pendingDois};
+    my $fnm= 'Pending_DOI_Create_Requests_'. Util::ts::ts_ISO_gmt(). '.json';
+    Util::JSON::write_json_file($fnm, $info);
+    print __LINE__, " saved $fnm\n";
    if (defined ($p) && ref($p) eq 'HASH')
    {
      push (@utheses_ids, map { $p->{$_}->{utheses_id} } keys %$p);
@@ -174,13 +208,47 @@ sub gpdcr
    }
  }
-  @utheses_ids;
+  $metrics{ut1_gpdcr_count}= @utheses_ids;
+  (wantarray) ? @utheses_ids : \@utheses_ids;
 }
-sub cleanup_queue
+sub remove_registered_DOIs
 {
-  my $q_col= $agent_db->get_collection('queue');
+  my $utheses_id_list= shift;
+  print __LINE__, " remove_registered_DOIs()\n";
+  # this is not useful, because new IDs are not listed here yet
+  # my $cur= $dc_col->find({ utheses_id => { '$in' => $utheses_id_list }, reg_status => { '$ne' => 1} });
+  my @utheses_ids= ();
+  my @registered_ids= ();
+  my ($gpdcr_checked, $gpdcr_queued)= (0, 0);
+  foreach my $ut_id (@$utheses_id_list)
+  {
+    my $dc_info= $dc_col->find_one({ utheses_id => "$ut_id" });
+    $gpdcr_checked++;
+    if ($dc_info->{reg_status} == 1)
+    {
+      push (@registered_ids, $ut_id);
+      next;
+    }
+    print __LINE__, " ut_id=[$ut_id] ", Dumper($dc_info);
+    push (@utheses_ids, $ut_id);
+    $gpdcr_queued++;
+  }
+  print __LINE__, " registered ids: ", join(' ', @registered_ids), "\n";
+  $metrics{ut1_gpdcr_already_registered}= @registered_ids;
+  $metrics{ut1_gpdcr_checked}= $gpdcr_checked;
+  $metrics{ut1_gpdcr_queued}= $gpdcr_queued;
+  (wantarray) ? @utheses_ids : \@utheses_ids;
+}
+sub cleanup_queue
+{
  my $j= $q_col->find_one({status => "in_progress", agent_name => $agent_name, agent_id => $agent_id});
  if (defined ($j))
  {
@@ -192,8 +260,6 @@ sub cleanup_queue
 sub get_job_from_queue
 {
-  my $q_col= $agent_db->get_collection('queue');
  my $j= $q_col->find_one({status => "in_progress", agent_name => $agent_name});
  if (defined ($j))
  {
@@ -222,8 +288,13 @@ sub process_utheses_item
  print __LINE__, " process_utheses_item: utheses_id=[$utheses_id] ", '='x50, "\n";
  my $register_doi_ok= 1;
+  if ($fix_problems) { remove_problem('utheses', $utheses_id); }
+  # first: compare existing utheses record (if it exists) with data from utheses
+  my $ut_data= $ut_col->find_one( { utheses_id => $utheses_id } );
  my $row= {};
-  my ($error_code, $status, $xml)= get_utheses_metadata($row, $utheses_id);
+  my ($error_code, $status, $xml)= get_utheses_metadata($row, $utheses_id, $ut_data);
  print __LINE__, " error_code=[$error_code] status=[$status]\n";
  print __LINE__, " xml=[$xml]\n";
@@ -233,6 +304,7 @@ sub process_utheses_item
  unless ($status eq '200')
  {
    push (@actions, 'no_utheses_record');
+    $metrics{ut1_no_record}++;
    return @actions;
  }
@@ -241,6 +313,7 @@ sub process_utheses_item
    push (@actions, "error_code=[$error_code]");
    report_problem( { area => 'utheses', problem => 'error_code', utheses_id => $utheses_id, error_code => $error_code } );
    $register_doi_ok= 0;
+    $metrics{ut1_utheses_errors}++;
  }
  my @x= analyze_marc_record($row);
@@ -248,14 +321,10 @@ sub process_utheses_item
  unless (defined ($agent_db))
  {
+    $metrics{ut1_missing_agent_db}++;
    return @actions;
  }
-  my $ut_col= $agent_db->get_collection('utheses');
-  if ($fix_problems) { remove_problem('utheses', $row->{utheses_id}); }
-  # first: compare existing utheses record (if it exists) with data from utheses
-  my $ut_data= $ut_col->find_one( { utheses_id => $row->{utheses_id} } );
  if (defined ($ut_data))
  {
    my @cmp_fields= qw(doi urn nbn ac_number);
@@ -272,6 +341,7 @@ sub process_utheses_item
    {
      report_problem( { area => 'utheses', problem => 'utheses_data_missmatch', utheses_id => $utheses_id, problems => \@problems } );
      push(@actions, 'problem report utheses');
+      $metrics{ut1_utheses_data_missmatch}++;
      return (@actions);
    }
  }
@@ -289,8 +359,7 @@ sub process_utheses_item
      remove_problem('datacite', $row->{doi});
    }
-    my $dc_col= $agent_db->get_collection('datacite');
+    print __LINE__, " row: ", Dumper($row);
    my ($utheses_id, $doi, $xml_md5, $url)= map { $row->{$_} } qw(utheses_id doi xml_md5 persistent_link);
    my $dc_record= $dc_col->find_one( { doi => $doi } );
@@ -304,6 +373,7 @@ sub process_utheses_item
      utheses_id   => $utheses_id,
      ac_number    => $row->{ac_number},
      xml_md5      => $xml_md5,
+    # xml_fnm      => $row->{xml_fnm},
      ts_epoch     => time(),
      ts_iso_gmt   => Util::ts::ts_ISO_gmt(),
    );
@@ -336,6 +406,7 @@ sub process_utheses_item
      {
        report_problem( { area => 'datacite', problem => 'datacite_data_missmatch', utheses_id => $utheses_id, doi => $doi, problems => \@problems } );
        push(@actions, 'problem report datacite');
+        $metrics{ut1_datacite_data_missmatch}++;
        return (@actions);
      }
@@ -343,6 +414,7 @@ sub process_utheses_item
      {
        print __LINE__, " datacite metadata unchanged; not updating!\n";
        push (@actions, 'datacite doi metadata unchanged');
+        $metrics{ut1_datacite_doi_metadata_unchanged}++;
        return @actions;
      }
    }
@@ -360,9 +432,14 @@ sub process_utheses_item
    my $res= $dc_col->update( { doi => $doi }, \%reg_info, { upsert => 1 } );
    print __LINE__, " insert res: ", Dumper($res);
-    unless ($ok)
+    if ($ok)
+    {
+      $metrics{ut1_datacite_doi_registration_ok}++;
+    }
+    else
    {
      report_problem( { area => 'datacite', problem => 'registration failure', utheses_id => $utheses_id, doi => $doi, reg_info => \%reg_info } );
+      $metrics{ut1_datacite_doi_registration_failure}++;
    }
  }
@@ -383,10 +460,12 @@ sub process_ac_number
    {
      my $utheses_id= $1;
      my ($error_code, $status, $xml)= get_utheses_metadata($row, $utheses_id);
+      $metrics{ut1_alma_utheses_link_ok}++;
    }
    else
    {
      # TODO: report bad link in Alma
+      $metrics{ut1_alma_utheses_link_bad}++;
    }
  }
@@ -398,11 +477,18 @@ sub analyze_marc_record
  my $row= shift;
  my $ac_number= shift || $row->{ac_number};
-  return ($row->{marc_record}= 'no_marc_db')        unless (defined ($marc_db));
+  unless (defined ($marc_db))
-  return ($row->{marc_record}= 'invalid_ac_number') unless ($ac_number =~ m#^AC\d{8}$#);
+  {
+    return ($row->{marc_record}= 'no_marc_db');
+    $metrics{ut1_no_marc_db}++;
+  }
+  unless ($ac_number =~ m#^AC\d{8}$#)
+  {
+    return ($row->{marc_record}= 'invalid_ac_number');
+    $metrics{ut1_invalid_ac_number}++;
+  }
  my @actions= ();
-  my $marc_col= $marc_db->get_collection('alma.marc');
  my $marc_rec= $marc_col->find_one({ ac_number => $ac_number });
  print __LINE__, " marc_rec: ", Dumper($marc_rec);
  my $request_marc_rec= 0;
@@ -418,12 +504,14 @@ sub analyze_marc_record
    if ($best_before > $now)
    {
      $row->{marc_record}= 'ok';
+      $metrics{ut1_alma_marc_record_fresh}++;
    }
    else
    {
      print __LINE__, " marc_record too old\n";
      $row->{marc_record}= 'too_old';
      $request_marc_rec= 1;
+      $metrics{ut1_alma_marc_record_requested}++;
    }
    # check marc record, even when it is too old
@@ -435,12 +523,11 @@ sub analyze_marc_record
  {
    $row->{marc_record}= 'not_found';
    $request_marc_rec= 1;
+    $metrics{ut1_alma_marc_record_not_found}++;
  }
  if ($request_marc_rec)
  {
-    my $req_col= $marc_db->get_collection('requests');
    my $req=
    {
      agent        => 'alma_cat',
@@ -556,6 +643,7 @@ sub get_utheses_metadata
 {
  my $row= shift;
  my $utheses_id= shift;
+  my $ut_data= shift;
  print __LINE__, " utheses_id=[$utheses_id]\n";
  my $info= $ut_api->getContainerPublicMetadata($utheses_id);
@@ -567,20 +655,43 @@ sub get_utheses_metadata
  if ($status eq '200')
  {
    $xml= utheses2datacite($row, $info, $utheses_id);
-    # print __LINE__, " row: ", Dumper($row);
+    print __LINE__, " row: ", Dumper($row);
+    $row->{xml_md5}= my $md5_hex= md5_hex(encode_utf8($xml));
    if (defined ($row->{doi}))
    {
-      if (defined ($xml) && defined (my $xml_fnm= $row->{xml_fnm}))
+      if (defined ($xml))
      {
+        my $save_xml= 1;
+        # print __LINE__, " xml=[$xml]\n";
        $row->{xml_fnm}= my $xml_fnm= 'utheses/DataCite_XML/'. $row->{doi}. '.xml';
        # print __LINE__, " DataCite_XML=[$xml_fnm] xml=[$xml]\n";
-        open (XML, '>:utf8', $xml_fnm) or die "can't write to $xml_fnm"; # TODO: or do something else...
-        print XML $xml;
+        print __LINE__, " ut_data=[$ut_data]\n";
-        close (XML);
+        if (defined ($ut_data))
-        my $md5= file_md5_hex($xml_fnm);
+        { # we have xml info in the database, check if they match
-        print __LINE__, " xml_fnm=[$xml_fnm] md5=[$md5]\n";
+          print __LINE__, " ut_data: ", Dumper($ut_data);
-        $row->{xml_md5}= $md5;
+          if (defined (my $xml_fnm= $ut_data->{xml_fnm}) && defined (my $xml_md5= $ut_data->{xml_md5}))
+          {
+            print __LINE__, " xml_fnm=[$xml_fnm] xml_md5=[$xml_md5] md5_hex=[$md5_hex]\n";
+            if ($md5_hex eq $xml_md5)
+            {
+              $save_xml= 0;
+              print __LINE__, " md5 unchanged, will not save it again; save_xml=0\n";
+            }
+          }
+        }
+        if ($save_xml)
+        {
+          print __LINE__, " save_xml=[$save_xml], saving DataCite XML to [$xml_fnm]\n";
+          open (XML, '>:utf8', $xml_fnm) or die "can't write to $xml_fnm"; # TODO: or do something else...
+          print XML $xml;
+          close (XML);
+          $row->{xml_md5}= my $md5= file_md5_hex($xml_fnm);
+          print __LINE__, " xml_fnm=[$xml_fnm] md5=[$md5]\n";
+        }
      }
      if ($row->{datacite_conversion_error_count} == 0)
@@ -684,9 +795,10 @@ EOX
  foreach my $author (@$authors)
  {
+    my ($family, $given)= map { xml_escape($_) } ($author->{family_name}, $author->{given_name});
    $xml .= << "EOX";
    <creator>
-      <creatorName>$author->{family_name}, $author->{given_name}</creatorName>
+      <creatorName>$family, $given</creatorName>
    </creator>
 EOX
  }
@@ -785,8 +897,6 @@ sub report_problem
 {
  my $problem= shift;
-  my $ut_col= $agent_db->get_collection('problems');
  $problem->{ts_iso_gmt}= Util::ts::ts_ISO_gmt();
  my $area= $problem->{area};
@@ -795,7 +905,7 @@ sub report_problem
  elsif ($area eq 'marc')     { $check_id= 'ac_number' }
  elsif ($area eq 'datacite') { $check_id= 'doi' }
-  $ut_col->update( { area => $area, $check_id => my $id= $problem->{$check_id} }, $problem, { upsert => 1 } ); # replace problem report, if one already exists
+  $problems_col->update( { area => $area, $check_id => my $id= $problem->{$check_id} }, $problem, { upsert => 1 } ); # replace problem report, if one already exists
  print __LINE__, " ATTN: problem reported for $check_id=$id: ", Dumper($problem);
 }
@@ -806,8 +916,6 @@ sub remove_problem
  my $id= shift;
  my $check_id= shift;
-  my $ut_col= $agent_db->get_collection('problems');
  unless (defined ($check_id))
  {
       if ($area eq 'utheses')  { $check_id= 'utheses_id' }
@@ -815,7 +923,34 @@ sub remove_problem
    elsif ($area eq 'datacite') { $check_id= 'doi' }
  }
-  $ut_col->remove({ area => $area, $check_id => $id });
+  $problems_col->remove({ area => $area, $check_id => $id });
+}
+sub write_metrics
+{
+  my $metrics= shift;
+  open (FO, '>:utf8', $nx_metrics_temp) or die "can't write to $nx_metrics_temp";
+  my $now= time();
+  print FO <<"EOX";
+# HELP ut1_last_run last time ut1 ran
+# TYPE ut1_last_run counter
+ut1_last_run $now
+EOX
+  foreach my $mn (sort keys %$metrics)
+  {
+    my $cnt= $metrics->{$mn};
+    print FO <<"EOX";
+# HELP $mn ut1 gauge
+# TYPE $mn gauge
+$mn $cnt
+EOX
+  }
+  close(FO);
+  rename($nx_metrics_temp, $nx_metrics_prom);
+  print __LINE__, " metrics written to $nx_metrics_prom\n";
 }
 __END__