From dcb166af9d7a74ffc1ce0c8704f6e250b34758d4 Mon Sep 17 00:00:00 2001
From: Gerhard Gonter <ggonter@gmail.com>
Date: Wed, 20 Apr 2022 19:29:01 +0200
Subject: [PATCH] first shot at the Utheses API

---
 lib/Univie/Utheses/API.pm |  67 +++++++++++++++
 ut1.pl                    | 171 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 238 insertions(+)
 create mode 100644 lib/Univie/Utheses/API.pm
 create mode 100755 ut1.pl

diff --git a/lib/Univie/Utheses/API.pm b/lib/Univie/Utheses/API.pm
new file mode 100644
index 0000000..847a3ce
--- /dev/null
+++ b/lib/Univie/Utheses/API.pm
@@ -0,0 +1,67 @@
+#!/usr/bin/perl
+
+use strict;
+use utf8;
+
+package Univie::Utheses::API;
+
+use base 'Module';
+
+use LWP;
+use JSON -convert_blessed_universally;
+use Encode;
+
+=head2 my $utapi= new Univie::Utheses::API( api_url => 'https://utheses-api-utheses-prod.cprod.univie.ac.at/' );
+
+=cut
+
+sub getContainerPublicMetadata
+{
+  my $self= shift;
+  my $utheses_id= shift;
+
+  my ($code1, $res1)= $self->utheses_request('GET', 'container/get/public', $utheses_id);
+  print __LINE__, " code1=[$code1] res1=[$res1]\n";
+
+  my $info;
+  if ($code1 =~ m#^2#)
+  {
+    eval
+    {
+      # utf8::upgrade($res1);
+      $info= from_json(decode("utf8", $res1));
+    };
+    if ($@)
+    {
+      die $@;
+    }
+  }
+
+  $info;
+}
+
+sub utheses_request
+{
+  my $self=   shift;
+  my $method= shift;
+  my $what=   shift;
+  my $id=     shift;
+
+  my ($api_url)= map { $self->{config}->{$_} } qw(api_url);
+
+  my $req_url= join ('/', $api_url, $what, $id);
+  print __LINE__, " req_url=[$req_url]\n";
+  my $req = HTTP::Request->new(
+    $method => $req_url,
+  );
+
+  my $ua= LWP::UserAgent->new;
+  my $res= $ua->request($req);
+
+  # print __LINE__, " res: ", main::Dumper($res);
+  my $txt= $res->content();
+  return ($res->code(), $txt);
+}
+
+1;
+
diff --git a/ut1.pl b/ut1.pl
new file mode 100755
index 0000000..e8ee019
--- /dev/null
+++ b/ut1.pl
@@ -0,0 +1,171 @@
+#!/usr/bin/perl
+
+use strict;
+
+use FileHandle;
+
+binmode( STDOUT, ':utf8' ); autoflush STDOUT 1;
+binmode( STDERR, ':utf8' ); autoflush STDERR 1;
+binmode( STDIN,  ':utf8' );
+
+use Data::Dumper;
+$Data::Dumper::Indent= 1;
+
+use lib 'lib';
+use Univie::Utheses::API;
+use Phaidra::Utils::iso639;
+
+# my %LANGUAGES= ( 'deu' => 'de', 'eng' => 'en' ); not used, see Phaidra::Utils::iso639
+
+my @TSV_COLUMNS= qw( utheses_id suffix doi nbn ac_number langs language persistent_link errors );
+
+my $op_mode= 'fetch_metadata';
+my $fnm_tsv= 'utheses/utheses_info.tsv'; # TODO: timestamp!
+
+my @pars= ();
+while (my $arg= shift (@ARGV))
+{
+  if ($arg =~ /^-/)
+  {
+  }
+  else
+  {
+    push (@pars, $arg);
+  }
+}
+
+if ($op_mode eq 'fetch_metadata')
+{
+  fetch_metadata(\@pars);
+}
+exit(0);
+
+sub fetch_metadata
+{
+  my $utheses_ids= shift;
+
+  my $utapi= new Univie::Utheses::API( config => { api_url => 'https://utheses-api-utheses-prod.cprod.univie.ac.at' } );
+
+  open (TSV, '>:utf8', $fnm_tsv) or die;
+  print TSV join("\t", @TSV_COLUMNS), "\n";
+
+  foreach my $utheses_id (@$utheses_ids)
+  {
+    next if ($utheses_id eq 'utheses_id'); # CSV column name...
+
+    print __LINE__, " utheses_id=[$utheses_id]\n";
+    my $info= $utapi->getContainerPublicMetadata($utheses_id);
+    print __LINE__, " info: ", Dumper($info);
+
+    if ($info->{status} eq '200' && exists ($info->{metadata}->{thesis}->{doi}))
+    {
+      my ($row, $xml)= utheses2datacite($info->{metadata}, $utheses_id);
+      print __LINE__, " row: ", Dumper($row);
+      print __LINE__, " xml=[$xml]\n";
+
+      print TSV join("\t", map { $row->{$_} } @TSV_COLUMNS), "\n";
+      if (defined ($row->{doi}))
+      {
+        my $xml_fnm= 'utheses/DataCite_XML/'.$row->{doi}.'.xml';
+        open (XML, '>:utf8', $xml_fnm) or die;
+        print XML $xml;
+        close (XML);
+      }
+    }
+  }
+}
+
+sub utheses2datacite
+{
+  my $md= shift;
+  my $utheses_id= shift;
+
+  my @errors;
+
+  my ($th, $authors)= map { $md->{$_} } qw(thesis authors);
+
+  my ($doi, $nbn, $ac_number, $persistent_link)= map { $th->{$_} } qw(doi urn ac_number persistent_link);
+  # TODO: check doi, nbn, ac_number for syntactic validity
+  my $suffix;
+  if ($doi =~ m#^10.25365/(thesis\.\d+)$#)
+  {
+    $suffix= $1;
+  }
+  else
+  {
+    push (@errors, 'bad_doi');
+  }
+  push (@errors, 'bad_nbn') unless ($nbn =~ m#^urn:nbn:at:at-ubw:1-\d{5}\.\d{5}\.\d{6}-\d$#);
+  push (@errors, 'bad_ac_number') unless ($ac_number =~ m#^AC\d{8}$#);
+
+  my ($titles, $abstracts, $publication_date, $langs)= map { $th->{$_} } qw(titels abstracts publication_date languages);
+  my $language= Phaidra::Utils::iso639::iso_639_2_to_1($langs->[0]);
+  unless (defined ($language))
+  {
+    $language= 'UNKNOWN';
+    push (@errors, 'language');
+  }
+
+  my $xml= << "EOX";
+<?xml version="1.0" encoding="utf-8"?>
+<resource xmlns="http://datacite.org/schema/kernel-4" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4/metadata.xsd">
+  <identifier identifierType="DOI">$doi</identifier>
+  <creators>
+EOX
+
+  foreach my $author (@$authors)
+  {
+    $xml .= << "EOX";
+    <creator>
+      <creatorName>$author->{family_name}, $author->{given_name}</creatorName>
+    </creator>
+EOX
+  }
+
+  $xml .= << "EOX";
+  </creators>
+  <titles>
+EOX
+
+  $xml .= << "EOX";
+  </titles>
+  <publisher>(:none)</publisher>
+  <publicationYear>2002</publicationYear>
+  <language>$language</language>
+  <resourceType resourceTypeGeneral="Text">Thesis</resourceType>
+  <alternateIdentifiers>
+    <alternateIdentifier alternateIdentifierType="nbn">$nbn</alternateIdentifier>
+  </alternateIdentifiers>
+  <descriptions>
+EOX
+
+  foreach my $abstract (@$abstracts)
+  {
+    $xml .= << "EOX";
+    <description descriptionType="Abstract">
+$abstract->{text}
+    </description>
+EOX
+  }
+
+  $xml .= << "EOX";
+  </descriptions>
+</resource>
+EOX
+
+  my %row=
+  (
+    utheses_id => $utheses_id,
+    suffix => $suffix,
+    doi => $doi,
+    nbn => $nbn,
+    ac_number => $ac_number,
+    langs => join(',', @$langs),
+    language => $language,
+    persistent_link => $persistent_link,
+    errors => join(',',@errors),
+  );
+
+  (\%row, $xml);
+}
+
-- 
GitLab