diff --git a/ut1.pl b/ut1.pl index e8ee019a398589abbe5c539a3523e2965437f9ee..71d61549014b17a87361af477ac501a617de8c4f 100755 --- a/ut1.pl +++ b/ut1.pl @@ -14,19 +14,38 @@ $Data::Dumper::Indent= 1; use lib 'lib'; use Univie::Utheses::API; use Phaidra::Utils::iso639; +use Util::JSON; +use DataCite::API; -# my %LANGUAGES= ( 'deu' => 'de', 'eng' => 'en' ); not used, see Phaidra::Utils::iso639 +my @TSV_COLUMNS= qw( utheses_id fulltext_locked suffix doi nbn ac_number langs language persistent_link xml_fnm errors ); -my @TSV_COLUMNS= qw( utheses_id suffix doi nbn ac_number langs language persistent_link errors ); - -my $op_mode= 'fetch_metadata'; +# my $op_mode= 'fetch_metadata_bulk'; +my $op_mode= 'analyze'; my $fnm_tsv= 'utheses/utheses_info.tsv'; # TODO: timestamp! +my $config_file= '/etc/irma/DataCite.json'; + +my $do_register_doi= 0; + my @pars= (); while (my $arg= shift (@ARGV)) { - if ($arg =~ /^-/) + if ($arg eq '--') { push (@pars, @ARGV); @ARGV= (); } + elsif ($arg =~ /^--(.+)/) { + my ($opt, $val)= split ('=', $1, 2); + if ($opt eq 'help') { usage(); } + elsif ($opt eq 'register-doi') { $do_register_doi= 1; } + else { usage(); } + } + elsif ($arg =~ /^-(.+)/) + { + foreach my $opt (split ('', $1)) + { + if ($opt eq 'h') { usage(); exit (0); } + # elsif ($opt eq 'x') { $x_flag= 1; } + else { usage("unknown option $arg"); } + } } else { @@ -34,55 +53,150 @@ while (my $arg= shift (@ARGV)) } } -if ($op_mode eq 'fetch_metadata') +my $cnf= Util::JSON::read_json_file ($config_file); +# print __LINE__, " cnf: ", main::Dumper ($cnf); exit(0); +my $ut_cnf= $cnf->{repositories}->{'utheses.univie.ac.at'}; +# print __LINE__, " ut_cnf: ", main::Dumper ($ut_cnf); exit(0); +my $reg_cnf= $cnf->{doi_registries}->{$ut_cnf->{registry}}; +# print __LINE__, " reg_cnf: ", main::Dumper ($reg_cnf); exit(0); + + my $utapi= new Univie::Utheses::API( config => { api_url => $ut_cnf->{api_url} } ); + die "no utapi" unless (defined ($utapi)); + + my $reg_obj= new DataCite::API (config => $reg_cnf, xmode => 'test'); + die "no reg_obj" unless (defined ($reg_obj)); + +if ($op_mode eq 'analyze') +{ + foreach my $par (@pars) + { + if ($par =~ m#^\d+$#) { analyze_utheses_item($par); } + } +} +elsif ($op_mode eq 'fetch_metadata_bulk') { - fetch_metadata(\@pars); + fetch_metadata_bulk(\@pars); } exit(0); -sub fetch_metadata +sub analyze_utheses_item { - my $utheses_ids= shift; + my $utheses_id= shift; + + my ($row, $xml)= get_utheses_metadata($utheses_id); - my $utapi= new Univie::Utheses::API( config => { api_url => 'https://utheses-api-utheses-prod.cprod.univie.ac.at' } ); + print __LINE__, " utheses_id=[$utheses_id] ", '='x50, "\n"; + print __LINE__, " xml=[$xml]\n"; + print __LINE__, " row: ", Dumper($row); + + if ($do_register_doi) + { + unless (defined ($row)) + { + print "ATTN: can't register DOI: no utheses data found\n"; + goto END; + } + + unless (defined ($xml)) + { + print "ATTN: can't register DOI: no xml data generated\n"; + goto END; + } + + my ($doi, $url, $errors, $ftl)= map { $row->{$_} } qw(doi persistent_link datacite_conversion_errors fulltext_locked); + if ($ftl) + { + print "ATTN: can't register DOI: fulltext locked\n"; + goto END; + } + + if (@$errors) + { + print "ATTN: can't register DOI $doi due to errors: ", join(', ', @$errors), "\n"; + goto END; + } + + if ($reg_obj->register_doi ($doi, $xml, $url)) + { + print "NOTE: register_doi doi=[$doi] url=[$url] OK\n"; + } + else + { + print "ATTN: register_doi doi=[$doi] url=[$url] was not ok\n"; + } + } + +END: + return; +} + +sub fetch_metadata_bulk +{ + my $utheses_ids= shift; open (TSV, '>:utf8', $fnm_tsv) or die; print TSV join("\t", @TSV_COLUMNS), "\n"; + open (ERRORS, '>>:utf8', '@errors.lst') or die; foreach my $utheses_id (@$utheses_ids) { next if ($utheses_id eq 'utheses_id'); # CSV column name... - print __LINE__, " utheses_id=[$utheses_id]\n"; - my $info= $utapi->getContainerPublicMetadata($utheses_id); - print __LINE__, " info: ", Dumper($info); + my ($row, $xml)= get_utheses_metadata($utheses_id); - if ($info->{status} eq '200' && exists ($info->{metadata}->{thesis}->{doi})) + if (defined ($row)) { - my ($row, $xml)= utheses2datacite($info->{metadata}, $utheses_id); - print __LINE__, " row: ", Dumper($row); - print __LINE__, " xml=[$xml]\n"; - print TSV join("\t", map { $row->{$_} } @TSV_COLUMNS), "\n"; - if (defined ($row->{doi})) + + } + } +} + +sub get_utheses_metadata +{ + my $utheses_id= shift; + + print __LINE__, " utheses_id=[$utheses_id]\n"; + my $info= $utapi->getContainerPublicMetadata($utheses_id); + print __LINE__, " info: ", Dumper($info); + + my ($row, $xml); + + if ($info->{status} eq '200') + { + ($row, $xml)= utheses2datacite($info, $utheses_id); + # print __LINE__, " row: ", Dumper($row); + + if (defined ($row->{doi})) + { + $row->{xml_fnm}= my $xml_fnm= 'utheses/DataCite_XML/'. $row->{doi}. '.xml'; + # print __LINE__, " DataCite_XML=[$xml_fnm] xml=[$xml]\n"; + + if (defined ($xml) && defined ($row->{xml_fnm}) && !@{$row->{datacite_conversion_errors}}) { - my $xml_fnm= 'utheses/DataCite_XML/'.$row->{doi}.'.xml'; - open (XML, '>:utf8', $xml_fnm) or die; + open (XML, '>:utf8', $row->{xml_fnm}) or die; # TODO: or do something else... print XML $xml; close (XML); } } } + else + { + printf ERRORS ("utheses_id=%d status=%s\n", $utheses_id, $info->{status}); + } + + return ($row, $xml); } sub utheses2datacite { - my $md= shift; + my $info= shift; my $utheses_id= shift; - my @errors; + my @datacite_conversion_errors; - my ($th, $authors)= map { $md->{$_} } qw(thesis authors); + my ($md)= map { $info->{$_} } qw(metadata); + my ($th, $authors, $phaidra)= map { $md->{$_} } qw(thesis authors phaidra); my ($doi, $nbn, $ac_number, $persistent_link)= map { $th->{$_} } qw(doi urn ac_number persistent_link); # TODO: check doi, nbn, ac_number for syntactic validity @@ -93,17 +207,39 @@ sub utheses2datacite } else { - push (@errors, 'bad_doi'); + push (@datacite_conversion_errors, "bad_doi=[$doi]"); } - push (@errors, 'bad_nbn') unless ($nbn =~ m#^urn:nbn:at:at-ubw:1-\d{5}\.\d{5}\.\d{6}-\d$#); - push (@errors, 'bad_ac_number') unless ($ac_number =~ m#^AC\d{8}$#); + push (@datacite_conversion_errors, "bad_nbn=[$nbn]") unless ($nbn =~ m#^urn:nbn:at:at-ubw:1-\d{5}\.\d{5}\.\d{6}-\d$#); + push (@datacite_conversion_errors, "bad_ac_number=[$ac_number]") unless ($ac_number =~ m#^AC\d{8}$#); - my ($titles, $abstracts, $publication_date, $langs)= map { $th->{$_} } qw(titels abstracts publication_date languages); - my $language= Phaidra::Utils::iso639::iso_639_2_to_1($langs->[0]); - unless (defined ($language)) + push (@datacite_conversion_errors, "bad_url=[$persistent_link]") unless ($persistent_link =~ m#^https://utheses.univie.ac.at/detail/\d+/?$#); + + my ($titles, $abstracts, $publication_date, $langs, $policies)= map { $th->{$_} } qw(titles abstracts publication_date languages policies); + my $fulltext_locked= $policies->{fulltext_locked}; + + my $publication_year; + if ($publication_date =~ m#^(\d{4})-?#) { - $language= 'UNKNOWN'; - push (@errors, 'language'); + $publication_year= $1; + } + else + { + push (@datacite_conversion_errors, "bad_pub_date=[$publication_date]"); + } + + my $main_language; + foreach my $lang (@$langs) + { + my $language= Phaidra::Utils::iso639::iso_639_2_to_1($lang); + if (defined ($language)) + { + $main_language= $language unless (defined ($main_language)); + } + else + { + push (@datacite_conversion_errors, "bad_language=[$lang]"); + $language= ':tba'; + } } my $xml= << "EOX"; @@ -127,23 +263,51 @@ EOX <titles> EOX + foreach my $title (@$titles) + { + my $language= Phaidra::Utils::iso639::iso_639_2_to_1(my $tl= $title->{title_lang}); + unless (defined ($language)) + { + push (@datacite_conversion_errors, "bad_title_language=[$tl]"); + $language= ':tba'; + } + my $tt= 'titleType="TranslatedTitle" ' if ($title->{type} eq 'parallel'); + + my $text= xml_escape($title->{title_text}); + $xml .= <<"EOX"; + <title ${tt}xml:lang="$language"> +$text + </title> +EOX + } + $xml .= << "EOX"; </titles> - <publisher>(:none)</publisher> - <publicationYear>2002</publicationYear> - <language>$language</language> + <publisher>:none</publisher> + <publicationYear>$publication_year</publicationYear> + <language>$main_language</language> <resourceType resourceTypeGeneral="Text">Thesis</resourceType> <alternateIdentifiers> - <alternateIdentifier alternateIdentifierType="nbn">$nbn</alternateIdentifier> + <alternateIdentifier alternateIdentifierType="URL">$persistent_link</alternateIdentifier> + <alternateIdentifier alternateIdentifierType="URN">$nbn</alternateIdentifier> </alternateIdentifiers> <descriptions> EOX foreach my $abstract (@$abstracts) { + my $language= Phaidra::Utils::iso639::iso_639_2_to_1(my $al= $abstract->{language}); + unless (defined ($language)) + { + push (@datacite_conversion_errors, "bad_abstract_language=[$al]"); + $language= ':tba'; + } + + # TODO: text should be escaped, if necessary + my $text= xml_escape($abstract->{text}); $xml .= << "EOX"; - <description descriptionType="Abstract"> -$abstract->{text} + <description descriptionType="Abstract" xml:lang="$language"> +$text </description> EOX } @@ -161,11 +325,25 @@ EOX nbn => $nbn, ac_number => $ac_number, langs => join(',', @$langs), - language => $language, + language => $main_language, persistent_link => $persistent_link, - errors => join(',',@errors), + datacite_conversion_errors => \@datacite_conversion_errors, + fulltext_locked => $fulltext_locked, + phaidra => $phaidra, + policies => $policies, ); (\%row, $xml); } +sub xml_escape +{ + my $s= shift; + + $s=~ s/&/&/g; + $s=~ s/</</g; + $s=~ s/>/>/g; + + $s; +} +