From c3d449027aa5bd51f989c654e6eee39c3f992576 Mon Sep 17 00:00:00 2001 From: Gerhard Gonter <ggonter@gmail.com> Date: Thu, 23 Mar 2023 18:07:35 +0100 Subject: [PATCH] latest version of ut1.pl --- ut1.pl | 215 ++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 175 insertions(+), 40 deletions(-) diff --git a/ut1.pl b/ut1.pl index 4cf8654..6a9abdc 100755 --- a/ut1.pl +++ b/ut1.pl @@ -13,6 +13,8 @@ use Data::Dumper; $Data::Dumper::Indent= 1; use Digest::MD5::File qw(file_md5_hex); +use Digest::MD5 qw(md5_hex); +use Encode qw(encode_utf8); use Util::ts; use Util::JSON; @@ -28,7 +30,6 @@ use Alma::MARC_Extractor; my @TSV_COLUMNS= qw( utheses_id fulltext_locked suffix doi nbn ac_number langs language persistent_link xml_fnm errors ); -# my $op_mode= 'fetch_metadata_bulk'; my $op_mode= 'process'; my $fnm_tsv= 'utheses/utheses_info.tsv'; # TODO: timestamp! @@ -38,6 +39,8 @@ my $agent_config_file= '/etc/irma/pidagent.json'; my $MAX_MARC_AGE= 86400*60; my $MAX_MARC_REQUESTS= 10_000; +my %metrics; + my $do_register_doi= 0; my $agent_name= 'pidagent'; my $agent_id= $$; @@ -53,6 +56,7 @@ while (my $arg= shift (@ARGV)) if ($opt eq 'help') { usage(); } elsif ($opt eq 'register-doi') { $do_register_doi= 1; } elsif ($opt eq 'fix') { $fix_problems= 1; } + elsif ($opt eq 'fmb') { my $op_mode= 'fetch_metadata_bulk'; } else { usage(); } } elsif ($arg =~ /^-(.+)/) @@ -89,12 +93,30 @@ die "no reg_obj" unless (defined ($reg_obj)); # get handles for various databases my $marc_db= IRMA::db::get_any_db($agent_cnf, 'marc_database'); -my $agent_db= IRMA::db::get_any_db($agent_cnf, 'pidagent_database'); -print __LINE__, " agent_db=[$agent_db]\n"; +my $marc_col= $marc_db->get_collection('alma.marc'); +my $req_col= $marc_db->get_collection('requests'); my @marc_fields= qw(ac_number mms_id fetched lib_code); my $mex= Alma::MARC_Extractor->new(\@marc_fields); +my $agent_db= IRMA::db::get_any_db($agent_cnf, 'pidagent_database'); +# print __LINE__, " agent_db=[$agent_db]\n"; +my $ut_col= $agent_db->get_collection('utheses'); +my $dc_col= $agent_db->get_collection('datacite'); +my $q_col= $agent_db->get_collection('queue'); +my $problems_col= $agent_db->get_collection('problems'); + +my ($nx_metrics_temp, $nx_metrics_prom); +my @REPORT; + +if (exists ($agent_cnf->{'collector-textfile-directory'}) + && defined (my $p= $agent_cnf->{'collector-textfile-directory'}) + ) +{ + $nx_metrics_temp= join('/', $p, 'ut1.temp'); + $nx_metrics_prom= join('/', $p, 'ut1.prom'); +} + my $running= 0; if ($op_mode eq 'process') { @@ -109,6 +131,7 @@ if ($op_mode eq 'process') my $utheses_id= $1; my @actions= process_utheses_item($utheses_id); print __LINE__, " process_utheses_item($utheses_id) ==> [", join(', ', @actions), "]\n"; + push (@REPORT, join(' ', $utheses_id, @actions)); } elsif ($par =~ m#^(\d+)\.\.(\d+)$# || $par =~ m#^(\d+)\-(\d+)$# || $par =~ m#(blk)(\d+)$#) { @@ -141,8 +164,10 @@ if ($op_mode eq 'process') } elsif ($par eq 'gpdcr') { - my @utheses_ids= gpdcr(); - # push (@pars, @utheses_ids); + my $utheses_ids1= gpdcr(); + my $utheses_ids2= remove_registered_DOIs($utheses_ids1); + print __LINE__, " gpdcr utheses_ids2: ", join (' ', @$utheses_ids2), "\n"; + push (@pars, @$utheses_ids2); } sleep(2) if (@pars); @@ -153,17 +178,26 @@ elsif ($op_mode eq 'fetch_metadata_bulk') { fetch_metadata_bulk(\@pars); } + +print __LINE__, " metrics: ", Dumper(\%metrics); +write_metrics(\%metrics); +print __LINE__, " REPORT:\n"; +foreach my $l (@REPORT) { print $l, "\n"; } + exit(0); sub gpdcr { my ($status, $txt, $info)= $ut_api->getPendingDoisCreateRequest(); - print __LINE__, " gpdcr: info: ", Dumper($info); + # print __LINE__, " gpdcr: info: ", Dumper($info); my @utheses_ids= (); if ($status eq '200') { my $p= $info->{pendingDois}; + my $fnm= 'Pending_DOI_Create_Requests_'. Util::ts::ts_ISO_gmt(). '.json'; + Util::JSON::write_json_file($fnm, $info); + print __LINE__, " saved $fnm\n"; if (defined ($p) && ref($p) eq 'HASH') { push (@utheses_ids, map { $p->{$_}->{utheses_id} } keys %$p); @@ -174,13 +208,47 @@ sub gpdcr } } - @utheses_ids; + $metrics{ut1_gpdcr_count}= @utheses_ids; + + (wantarray) ? @utheses_ids : \@utheses_ids; } -sub cleanup_queue +sub remove_registered_DOIs { - my $q_col= $agent_db->get_collection('queue'); + my $utheses_id_list= shift; + print __LINE__, " remove_registered_DOIs()\n"; + + # this is not useful, because new IDs are not listed here yet + # my $cur= $dc_col->find({ utheses_id => { '$in' => $utheses_id_list }, reg_status => { '$ne' => 1} }); + my @utheses_ids= (); + my @registered_ids= (); + my ($gpdcr_checked, $gpdcr_queued)= (0, 0); + foreach my $ut_id (@$utheses_id_list) + { + my $dc_info= $dc_col->find_one({ utheses_id => "$ut_id" }); + $gpdcr_checked++; + if ($dc_info->{reg_status} == 1) + { + push (@registered_ids, $ut_id); + next; + } + + print __LINE__, " ut_id=[$ut_id] ", Dumper($dc_info); + push (@utheses_ids, $ut_id); + $gpdcr_queued++; + } + + print __LINE__, " registered ids: ", join(' ', @registered_ids), "\n"; + $metrics{ut1_gpdcr_already_registered}= @registered_ids; + $metrics{ut1_gpdcr_checked}= $gpdcr_checked; + $metrics{ut1_gpdcr_queued}= $gpdcr_queued; + + (wantarray) ? @utheses_ids : \@utheses_ids; +} + +sub cleanup_queue +{ my $j= $q_col->find_one({status => "in_progress", agent_name => $agent_name, agent_id => $agent_id}); if (defined ($j)) { @@ -192,8 +260,6 @@ sub cleanup_queue sub get_job_from_queue { - my $q_col= $agent_db->get_collection('queue'); - my $j= $q_col->find_one({status => "in_progress", agent_name => $agent_name}); if (defined ($j)) { @@ -222,8 +288,13 @@ sub process_utheses_item print __LINE__, " process_utheses_item: utheses_id=[$utheses_id] ", '='x50, "\n"; my $register_doi_ok= 1; + + if ($fix_problems) { remove_problem('utheses', $utheses_id); } + # first: compare existing utheses record (if it exists) with data from utheses + my $ut_data= $ut_col->find_one( { utheses_id => $utheses_id } ); + my $row= {}; - my ($error_code, $status, $xml)= get_utheses_metadata($row, $utheses_id); + my ($error_code, $status, $xml)= get_utheses_metadata($row, $utheses_id, $ut_data); print __LINE__, " error_code=[$error_code] status=[$status]\n"; print __LINE__, " xml=[$xml]\n"; @@ -233,6 +304,7 @@ sub process_utheses_item unless ($status eq '200') { push (@actions, 'no_utheses_record'); + $metrics{ut1_no_record}++; return @actions; } @@ -241,6 +313,7 @@ sub process_utheses_item push (@actions, "error_code=[$error_code]"); report_problem( { area => 'utheses', problem => 'error_code', utheses_id => $utheses_id, error_code => $error_code } ); $register_doi_ok= 0; + $metrics{ut1_utheses_errors}++; } my @x= analyze_marc_record($row); @@ -248,14 +321,10 @@ sub process_utheses_item unless (defined ($agent_db)) { + $metrics{ut1_missing_agent_db}++; return @actions; } - my $ut_col= $agent_db->get_collection('utheses'); - - if ($fix_problems) { remove_problem('utheses', $row->{utheses_id}); } - # first: compare existing utheses record (if it exists) with data from utheses - my $ut_data= $ut_col->find_one( { utheses_id => $row->{utheses_id} } ); if (defined ($ut_data)) { my @cmp_fields= qw(doi urn nbn ac_number); @@ -272,6 +341,7 @@ sub process_utheses_item { report_problem( { area => 'utheses', problem => 'utheses_data_missmatch', utheses_id => $utheses_id, problems => \@problems } ); push(@actions, 'problem report utheses'); + $metrics{ut1_utheses_data_missmatch}++; return (@actions); } } @@ -289,8 +359,7 @@ sub process_utheses_item remove_problem('datacite', $row->{doi}); } - my $dc_col= $agent_db->get_collection('datacite'); - + print __LINE__, " row: ", Dumper($row); my ($utheses_id, $doi, $xml_md5, $url)= map { $row->{$_} } qw(utheses_id doi xml_md5 persistent_link); my $dc_record= $dc_col->find_one( { doi => $doi } ); @@ -304,6 +373,7 @@ sub process_utheses_item utheses_id => $utheses_id, ac_number => $row->{ac_number}, xml_md5 => $xml_md5, + # xml_fnm => $row->{xml_fnm}, ts_epoch => time(), ts_iso_gmt => Util::ts::ts_ISO_gmt(), ); @@ -336,6 +406,7 @@ sub process_utheses_item { report_problem( { area => 'datacite', problem => 'datacite_data_missmatch', utheses_id => $utheses_id, doi => $doi, problems => \@problems } ); push(@actions, 'problem report datacite'); + $metrics{ut1_datacite_data_missmatch}++; return (@actions); } @@ -343,6 +414,7 @@ sub process_utheses_item { print __LINE__, " datacite metadata unchanged; not updating!\n"; push (@actions, 'datacite doi metadata unchanged'); + $metrics{ut1_datacite_doi_metadata_unchanged}++; return @actions; } } @@ -360,9 +432,14 @@ sub process_utheses_item my $res= $dc_col->update( { doi => $doi }, \%reg_info, { upsert => 1 } ); print __LINE__, " insert res: ", Dumper($res); - unless ($ok) + if ($ok) + { + $metrics{ut1_datacite_doi_registration_ok}++; + } + else { report_problem( { area => 'datacite', problem => 'registration failure', utheses_id => $utheses_id, doi => $doi, reg_info => \%reg_info } ); + $metrics{ut1_datacite_doi_registration_failure}++; } } @@ -383,10 +460,12 @@ sub process_ac_number { my $utheses_id= $1; my ($error_code, $status, $xml)= get_utheses_metadata($row, $utheses_id); + $metrics{ut1_alma_utheses_link_ok}++; } else { # TODO: report bad link in Alma + $metrics{ut1_alma_utheses_link_bad}++; } } @@ -398,11 +477,18 @@ sub analyze_marc_record my $row= shift; my $ac_number= shift || $row->{ac_number}; - return ($row->{marc_record}= 'no_marc_db') unless (defined ($marc_db)); - return ($row->{marc_record}= 'invalid_ac_number') unless ($ac_number =~ m#^AC\d{8}$#); + unless (defined ($marc_db)) + { + return ($row->{marc_record}= 'no_marc_db'); + $metrics{ut1_no_marc_db}++; + } + unless ($ac_number =~ m#^AC\d{8}$#) + { + return ($row->{marc_record}= 'invalid_ac_number'); + $metrics{ut1_invalid_ac_number}++; + } my @actions= (); - my $marc_col= $marc_db->get_collection('alma.marc'); my $marc_rec= $marc_col->find_one({ ac_number => $ac_number }); print __LINE__, " marc_rec: ", Dumper($marc_rec); my $request_marc_rec= 0; @@ -418,12 +504,14 @@ sub analyze_marc_record if ($best_before > $now) { $row->{marc_record}= 'ok'; + $metrics{ut1_alma_marc_record_fresh}++; } else { print __LINE__, " marc_record too old\n"; $row->{marc_record}= 'too_old'; $request_marc_rec= 1; + $metrics{ut1_alma_marc_record_requested}++; } # check marc record, even when it is too old @@ -435,12 +523,11 @@ sub analyze_marc_record { $row->{marc_record}= 'not_found'; $request_marc_rec= 1; + $metrics{ut1_alma_marc_record_not_found}++; } if ($request_marc_rec) { - my $req_col= $marc_db->get_collection('requests'); - my $req= { agent => 'alma_cat', @@ -556,6 +643,7 @@ sub get_utheses_metadata { my $row= shift; my $utheses_id= shift; + my $ut_data= shift; print __LINE__, " utheses_id=[$utheses_id]\n"; my $info= $ut_api->getContainerPublicMetadata($utheses_id); @@ -567,20 +655,43 @@ sub get_utheses_metadata if ($status eq '200') { $xml= utheses2datacite($row, $info, $utheses_id); - # print __LINE__, " row: ", Dumper($row); + print __LINE__, " row: ", Dumper($row); + $row->{xml_md5}= my $md5_hex= md5_hex(encode_utf8($xml)); if (defined ($row->{doi})) { - if (defined ($xml) && defined (my $xml_fnm= $row->{xml_fnm})) + if (defined ($xml)) { + my $save_xml= 1; + # print __LINE__, " xml=[$xml]\n"; $row->{xml_fnm}= my $xml_fnm= 'utheses/DataCite_XML/'. $row->{doi}. '.xml'; # print __LINE__, " DataCite_XML=[$xml_fnm] xml=[$xml]\n"; - open (XML, '>:utf8', $xml_fnm) or die "can't write to $xml_fnm"; # TODO: or do something else... - print XML $xml; - close (XML); - my $md5= file_md5_hex($xml_fnm); - print __LINE__, " xml_fnm=[$xml_fnm] md5=[$md5]\n"; - $row->{xml_md5}= $md5; + + print __LINE__, " ut_data=[$ut_data]\n"; + if (defined ($ut_data)) + { # we have xml info in the database, check if they match + print __LINE__, " ut_data: ", Dumper($ut_data); + if (defined (my $xml_fnm= $ut_data->{xml_fnm}) && defined (my $xml_md5= $ut_data->{xml_md5})) + { + print __LINE__, " xml_fnm=[$xml_fnm] xml_md5=[$xml_md5] md5_hex=[$md5_hex]\n"; + if ($md5_hex eq $xml_md5) + { + $save_xml= 0; + print __LINE__, " md5 unchanged, will not save it again; save_xml=0\n"; + } + } + } + + if ($save_xml) + { + print __LINE__, " save_xml=[$save_xml], saving DataCite XML to [$xml_fnm]\n"; + open (XML, '>:utf8', $xml_fnm) or die "can't write to $xml_fnm"; # TODO: or do something else... + print XML $xml; + close (XML); + + $row->{xml_md5}= my $md5= file_md5_hex($xml_fnm); + print __LINE__, " xml_fnm=[$xml_fnm] md5=[$md5]\n"; + } } if ($row->{datacite_conversion_error_count} == 0) @@ -684,9 +795,10 @@ EOX foreach my $author (@$authors) { + my ($family, $given)= map { xml_escape($_) } ($author->{family_name}, $author->{given_name}); $xml .= << "EOX"; <creator> - <creatorName>$author->{family_name}, $author->{given_name}</creatorName> + <creatorName>$family, $given</creatorName> </creator> EOX } @@ -785,8 +897,6 @@ sub report_problem { my $problem= shift; - my $ut_col= $agent_db->get_collection('problems'); - $problem->{ts_iso_gmt}= Util::ts::ts_ISO_gmt(); my $area= $problem->{area}; @@ -795,7 +905,7 @@ sub report_problem elsif ($area eq 'marc') { $check_id= 'ac_number' } elsif ($area eq 'datacite') { $check_id= 'doi' } - $ut_col->update( { area => $area, $check_id => my $id= $problem->{$check_id} }, $problem, { upsert => 1 } ); # replace problem report, if one already exists + $problems_col->update( { area => $area, $check_id => my $id= $problem->{$check_id} }, $problem, { upsert => 1 } ); # replace problem report, if one already exists print __LINE__, " ATTN: problem reported for $check_id=$id: ", Dumper($problem); } @@ -806,8 +916,6 @@ sub remove_problem my $id= shift; my $check_id= shift; - my $ut_col= $agent_db->get_collection('problems'); - unless (defined ($check_id)) { if ($area eq 'utheses') { $check_id= 'utheses_id' } @@ -815,7 +923,34 @@ sub remove_problem elsif ($area eq 'datacite') { $check_id= 'doi' } } - $ut_col->remove({ area => $area, $check_id => $id }); + $problems_col->remove({ area => $area, $check_id => $id }); +} + +sub write_metrics +{ + my $metrics= shift; + open (FO, '>:utf8', $nx_metrics_temp) or die "can't write to $nx_metrics_temp"; + my $now= time(); + print FO <<"EOX"; +# HELP ut1_last_run last time ut1 ran +# TYPE ut1_last_run counter +ut1_last_run $now +EOX + + foreach my $mn (sort keys %$metrics) + { + my $cnt= $metrics->{$mn}; + print FO <<"EOX"; +# HELP $mn ut1 gauge +# TYPE $mn gauge +$mn $cnt +EOX + } + + close(FO); + + rename($nx_metrics_temp, $nx_metrics_prom); + print __LINE__, " metrics written to $nx_metrics_prom\n"; } __END__ -- GitLab