From dcb166af9d7a74ffc1ce0c8704f6e250b34758d4 Mon Sep 17 00:00:00 2001 From: Gerhard Gonter <ggonter@gmail.com> Date: Wed, 20 Apr 2022 19:29:01 +0200 Subject: [PATCH] first shot at the Utheses API --- lib/Univie/Utheses/API.pm | 67 +++++++++++++++ ut1.pl | 171 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 238 insertions(+) create mode 100644 lib/Univie/Utheses/API.pm create mode 100755 ut1.pl diff --git a/lib/Univie/Utheses/API.pm b/lib/Univie/Utheses/API.pm new file mode 100644 index 0000000..847a3ce --- /dev/null +++ b/lib/Univie/Utheses/API.pm @@ -0,0 +1,67 @@ +#!/usr/bin/perl + +use strict; +use utf8; + +package Univie::Utheses::API; + +use base 'Module'; + +use LWP; +use JSON -convert_blessed_universally; +use Encode; + +=head2 my $utapi= new Univie::Utheses::API( api_url => 'https://utheses-api-utheses-prod.cprod.univie.ac.at/' ); + +=cut + +sub getContainerPublicMetadata +{ + my $self= shift; + my $utheses_id= shift; + + my ($code1, $res1)= $self->utheses_request('GET', 'container/get/public', $utheses_id); + print __LINE__, " code1=[$code1] res1=[$res1]\n"; + + my $info; + if ($code1 =~ m#^2#) + { + eval + { + # utf8::upgrade($res1); + $info= from_json(decode("utf8", $res1)); + }; + if ($@) + { + die $@; + } + } + + $info; +} + +sub utheses_request +{ + my $self= shift; + my $method= shift; + my $what= shift; + my $id= shift; + + my ($api_url)= map { $self->{config}->{$_} } qw(api_url); + + my $req_url= join ('/', $api_url, $what, $id); + print __LINE__, " req_url=[$req_url]\n"; + my $req = HTTP::Request->new( + $method => $req_url, + ); + + my $ua= LWP::UserAgent->new; + my $res= $ua->request($req); + + # print __LINE__, " res: ", main::Dumper($res); + my $txt= $res->content(); + return ($res->code(), $txt); +} + +1; + diff --git a/ut1.pl b/ut1.pl new file mode 100755 index 0000000..e8ee019 --- /dev/null +++ b/ut1.pl @@ -0,0 +1,171 @@ +#!/usr/bin/perl + +use strict; + +use FileHandle; + +binmode( STDOUT, ':utf8' ); autoflush STDOUT 1; +binmode( STDERR, ':utf8' ); autoflush STDERR 1; +binmode( STDIN, ':utf8' ); + +use Data::Dumper; +$Data::Dumper::Indent= 1; + +use lib 'lib'; +use Univie::Utheses::API; +use Phaidra::Utils::iso639; + +# my %LANGUAGES= ( 'deu' => 'de', 'eng' => 'en' ); not used, see Phaidra::Utils::iso639 + +my @TSV_COLUMNS= qw( utheses_id suffix doi nbn ac_number langs language persistent_link errors ); + +my $op_mode= 'fetch_metadata'; +my $fnm_tsv= 'utheses/utheses_info.tsv'; # TODO: timestamp! + +my @pars= (); +while (my $arg= shift (@ARGV)) +{ + if ($arg =~ /^-/) + { + } + else + { + push (@pars, $arg); + } +} + +if ($op_mode eq 'fetch_metadata') +{ + fetch_metadata(\@pars); +} +exit(0); + +sub fetch_metadata +{ + my $utheses_ids= shift; + + my $utapi= new Univie::Utheses::API( config => { api_url => 'https://utheses-api-utheses-prod.cprod.univie.ac.at' } ); + + open (TSV, '>:utf8', $fnm_tsv) or die; + print TSV join("\t", @TSV_COLUMNS), "\n"; + + foreach my $utheses_id (@$utheses_ids) + { + next if ($utheses_id eq 'utheses_id'); # CSV column name... + + print __LINE__, " utheses_id=[$utheses_id]\n"; + my $info= $utapi->getContainerPublicMetadata($utheses_id); + print __LINE__, " info: ", Dumper($info); + + if ($info->{status} eq '200' && exists ($info->{metadata}->{thesis}->{doi})) + { + my ($row, $xml)= utheses2datacite($info->{metadata}, $utheses_id); + print __LINE__, " row: ", Dumper($row); + print __LINE__, " xml=[$xml]\n"; + + print TSV join("\t", map { $row->{$_} } @TSV_COLUMNS), "\n"; + if (defined ($row->{doi})) + { + my $xml_fnm= 'utheses/DataCite_XML/'.$row->{doi}.'.xml'; + open (XML, '>:utf8', $xml_fnm) or die; + print XML $xml; + close (XML); + } + } + } +} + +sub utheses2datacite +{ + my $md= shift; + my $utheses_id= shift; + + my @errors; + + my ($th, $authors)= map { $md->{$_} } qw(thesis authors); + + my ($doi, $nbn, $ac_number, $persistent_link)= map { $th->{$_} } qw(doi urn ac_number persistent_link); + # TODO: check doi, nbn, ac_number for syntactic validity + my $suffix; + if ($doi =~ m#^10.25365/(thesis\.\d+)$#) + { + $suffix= $1; + } + else + { + push (@errors, 'bad_doi'); + } + push (@errors, 'bad_nbn') unless ($nbn =~ m#^urn:nbn:at:at-ubw:1-\d{5}\.\d{5}\.\d{6}-\d$#); + push (@errors, 'bad_ac_number') unless ($ac_number =~ m#^AC\d{8}$#); + + my ($titles, $abstracts, $publication_date, $langs)= map { $th->{$_} } qw(titels abstracts publication_date languages); + my $language= Phaidra::Utils::iso639::iso_639_2_to_1($langs->[0]); + unless (defined ($language)) + { + $language= 'UNKNOWN'; + push (@errors, 'language'); + } + + my $xml= << "EOX"; +<?xml version="1.0" encoding="utf-8"?> +<resource xmlns="http://datacite.org/schema/kernel-4" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4/metadata.xsd"> + <identifier identifierType="DOI">$doi</identifier> + <creators> +EOX + + foreach my $author (@$authors) + { + $xml .= << "EOX"; + <creator> + <creatorName>$author->{family_name}, $author->{given_name}</creatorName> + </creator> +EOX + } + + $xml .= << "EOX"; + </creators> + <titles> +EOX + + $xml .= << "EOX"; + </titles> + <publisher>(:none)</publisher> + <publicationYear>2002</publicationYear> + <language>$language</language> + <resourceType resourceTypeGeneral="Text">Thesis</resourceType> + <alternateIdentifiers> + <alternateIdentifier alternateIdentifierType="nbn">$nbn</alternateIdentifier> + </alternateIdentifiers> + <descriptions> +EOX + + foreach my $abstract (@$abstracts) + { + $xml .= << "EOX"; + <description descriptionType="Abstract"> +$abstract->{text} + </description> +EOX + } + + $xml .= << "EOX"; + </descriptions> +</resource> +EOX + + my %row= + ( + utheses_id => $utheses_id, + suffix => $suffix, + doi => $doi, + nbn => $nbn, + ac_number => $ac_number, + langs => join(',', @$langs), + language => $language, + persistent_link => $persistent_link, + errors => join(',',@errors), + ); + + (\%row, $xml); +} + -- GitLab