diff --git a/.gitignore b/.gitignore index d6e6b4943a6b7810dfd609802fd8d5322d7adf70..63dcd8d0c54163a0fa05b938ffa8b87c976f99a9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +datacite-example-full-v4.0.xml +o_13723.datacite.xml @* *.tsv *.lst diff --git a/dcd.pl b/dcd.pl new file mode 100755 index 0000000000000000000000000000000000000000..1c6fac7bd073b213794ba0c3ee1f525a1abd3f9e --- /dev/null +++ b/dcd.pl @@ -0,0 +1,242 @@ +#!/usr/bin/perl +# $Id: script.pl,v 1.15 2016/09/26 10:00:50 gonter Exp $ + +=head1 NAME + +DataCite DOI + +=cut + +use strict; + +# use FindBin; +use lib 'lib'; + +use Data::Dumper; +$Data::Dumper::Indent= 1; + +# use Module; +use JSON; +use Util::JSON; + +binmode( STDOUT, ':utf8' ); autoflush STDOUT 1; + +use Phaidra::DataCite; +use DataCite::API; + +# binmode( STDERR, ':utf8' ); autoflush STDERR 1; +# binmode( STDIN, ':utf8' ); + +my $config_file= '/etc/irma/DataCite.json'; + +my $x_flag= 0; + +my @PARS; +my $arg; +while (defined ($arg= shift (@ARGV))) +{ + if ($arg eq '--') { push (@PARS, @ARGV); @ARGV= (); } + elsif ($arg =~ /^--(.+)/) + { + my ($opt, $val)= split ('=', $1, 2); + if ($opt eq 'help') { usage(); } + else { usage(); } + } + elsif ($arg =~ /^-(.+)/) + { + foreach my $opt (split ('', $1)) + { + if ($opt eq 'h') { usage(); exit (0); } + elsif ($opt eq 'x') { $x_flag= 1; } + else { usage(); } + } + } + else + { + push (@PARS, $arg); + } +} + +# print join (' ', __FILE__, __LINE__, 'caller=['. caller() . ']'), "\n"; + +usage('no arguments') unless (@PARS); + +my $cnf= Util::JSON::read_json_file ($config_file); +# print __LINE__, " cnf: ", main::Dumper ($cnf); exit(0); + +my $op_code= shift (@PARS); +print __LINE__, " op_code=[$op_code]\n"; + +if ($op_code eq 'register') +{ + my $repo_url= shift (@PARS); + register_url ($cnf, $repo_url); +} + +exit (0); + +sub usage +{ + my $msg= join (' ', @_); + if ($msg) { print $msg, "\n"; sleep(3); } + print <<EOX; +usage: $0 [-opts] pars + +template ... + +options: +-h ... help +-x ... set x flag +-- ... remaining args are parameters +EOX + + + exit (0); +} + +sub register_url +{ + my $cnf= shift; + my $repo_url= shift; + + print __LINE__, " repo_url=[$repo_url]\n"; + if ($repo_url =~ m#^https://(phaidra(-(sandbox|temp))?\.univie\.ac\.at)/(o:\d+)$#) + { + my ($repo, $s1, $s2, $pid)= ($1, $2, $3, $4); + usage ("unknown repo=[$repo]") unless (exists ($cnf->{repositories}->{$repo})); + print __LINE__, " repo=[$repo]\n"; + + my $repo_cnf= $cnf->{repositories}->{$repo}; + my $reg_cnf= $cnf->{doi_registries}->{$repo_cnf->{registry}}; + + # print __LINE__, " repo_cnf: ", main::Dumper ($repo_cnf); + # print __LINE__, " reg_cnf: ", main::Dumper ($reg_cnf); + + my $repo_obj= new Phaidra::DataCite (config => $repo_cnf); + # print __LINE__, " repo_obj: ", main::Dumper ($repo_obj); + + # my ($c1, $xml)= $repo_obj->get_metadata ($pid, 'xml'); + my ($c2, $json)= $repo_obj->get_metadata ($pid, 'json'); + my $md= decode_json ($json); + print __LINE__, " md: ", Dumper ($md); + + # TODO: + # * mint new DOI + # * insert DOI in metadata + + my $reg_obj= new DataCite::API (config => $reg_cnf, 'xmode' => 'test'); + my $doi_string= $reg_obj->mint_doi(); + + my $doi_element= + { + xmlname => 'identifier', + value => $doi_string, + attributes => + [ + { + xmlname => 'identifierType', + value => 'DOI' + } + ] + }; + print __LINE__, " doi_element: ", main::Dumper($doi_element); + + unshift (@$md, $doi_element); + + my $xml_new= $repo_obj->json_2_xml ($md); + print __LINE__, " xml_new=[$xml_new]\n"; + + # TODO: interact with the IRMA database to find out if the DOI string is really unique and register in there + + # TODO: interact with DataCite API to register the DOI with the metadata + $reg_obj->register_doi ($doi_string, $xml_new, $repo_url); + } + else + { + usage("unknown repo_url=[$repo_url]"); + } + +} + +# ---------------------------------------------------------------------------- +sub main_function +{ + my $fnm= shift; + print "main_function: $fnm\n"; + hex_dump_file ($fnm); +} + +# ---------------------------------------------------------------------------- +sub hex_dump_file +{ + my $fnm= shift; + my $buffer_size= shift || 4 * 1048576; + # my $buffer_size= shift || 32*1024; + + open (FI, '<:raw', $fnm) or die "can not read [$fnm]\n"; + my $buffer; + my $segment= 0; + while (1) + { + my $rc1= sysread (FI, $buffer, $buffer_size); + printf ("segment=[%s] fnm=[%d] rc1=[0x%08lx] (bs=0x%08lx)\n", $segment++, $fnm, $rc1, $buffer_size); + hex_dump ($buffer); + last if ($rc1 < $buffer_size); + } + close (FI); +} + +# ---------------------------------------------------------------------------- +sub hex_dump +{ + my $data= shift; + local *FX= shift || *STDOUT; + + my $off= 0; + my ($i, $c, $v); + + my $run= 1; + DATA: while ($run) + { + my $char= ''; + my $hex= ''; + my $offx= sprintf ('%08X', $off); + + for ($i= 0; $i < 16; $i++) + { + $c= substr ($data, $off+$i, 1); + + if ($i == 8) + { + $hex .= ' '; + } + + if ($c ne '') + { + # $data= substr ($data, 1); + $v= unpack ('C', $c); + $c= '.' if ($v < 0x20 || $v >= 0x7F); + + $char .= $c; + $hex .= sprintf (' %02X', $v); + } + else + { + $char .= ' '; + $hex .= ' '; + $run= 0; + } + } + + print FX "$offx $hex |$char|\n"; + + $off += 0x10; + } +} + +__END__ + +=head1 AUTHOR + +Firstname Lastname <address@example.org> + diff --git a/lib/DataCite/API.pm b/lib/DataCite/API.pm new file mode 100644 index 0000000000000000000000000000000000000000..2b78f26b806c84a62eca91e2f7343048965b5097 --- /dev/null +++ b/lib/DataCite/API.pm @@ -0,0 +1,133 @@ + +=head1 NAME + + DataCite::API + +=head1 DESCRIPTION + +Interact with DataCitee API ([1], [2]) + +=head1 REFERENCES + +* [1] https://mds.datacite.org/static/apidoc +* [2] https://mds.test.datacite.org/static/apidoc +* [3] http://schema.test.datacite.org/ +* [4] https://blog.datacite.org/cool-dois/ + +=cut + +package DataCite::API; + +use strict; +use utf8; + +use base 'Module'; + +use Encode; +use LWP; +# use Crypt::SSLeay; +use HTTP::Headers::Util; + +use Encode::Base32::Crockford; + +sub mint_doi +{ + my $self= shift; + + # my $prefix= ($self->{mode} eq 'test') ? $cfg->{test_prefix} : $cfg->{prefix}; + + return join ('/', $self->{config}->{prefix}, mk_doi()), +} + +sub register_doi +{ + my $self= shift; + my $doi= shift; + my $metadata_xml= shift; + my $repo_url= shift; + + my ($code1, $res1)= $self->datacite_request ('POST', 'metadata', $metadata_xml, 'application/xml;charset=UTF-8'); + print __LINE__, " code1=[$code1] res1=[$res1]\n"; + unless ($code1 =~ m#^20[01]#) + { + print STDERR "ATTN: register_doi POST metadata returned code1=[$code1] res1=[$res1]\n"; + return undef; + } + + my $doi_reg = <<"EOX"; +doi=$doi +url=$repo_url +EOX + + my ($code2, $res2)= $self->datacite_request ('POST', 'doi', $doi_reg, 'application/xml;charset=UTF-8'); + print __LINE__, " code2=[$code2] res2=[$res2]\n"; + + unless ($code2 =~ m#^20[01]#) + { + print STDERR "ATTN: register_doi POST doi returned code2=[$code2] res2=[$res2]\n"; + return undef; + } +} + +sub datacite_request +{ + my $self= shift; + my $method= shift; + my $verb= shift; + my $content= shift; + my $content_type= shift; + + my ($api_url, $username, $password)= map { $self->{config}->{$_} } qw(api_url username password); + + my $req_url= join ('/', $api_url, $verb); + print __LINE__, " method=[$method] req_url=[$req_url]\n"; + + # build request + my $headers = HTTP::Headers->new( + 'Accept' => 'application/xml', + 'Content-Type' => $content_type + ); + + my $req = HTTP::Request->new( + $method => $req_url, + $headers, Encode::encode_utf8( $content ) + ); + $req->authorization_basic($username, $password); + + # pass request to the user agent and get a response back + my $ua= LWP::UserAgent->new; + my $res= $ua->request($req); + + return ($res->code(), $res->content()); +} + +=head1 INTERNAL FUNCTIONS + +=head2 $doi= mk_doi() + +actually mint a DOI according to [4] + +=cut + +sub mk_doi +{ + my $tries= 3; + my $str; + + do + { + my $num= rand(33285996544) + 1073741824; + # my $num= rand(33286544) + 107824; + $str= Encode::Base32::Crockford::base32_encode_with_checksum($num); + # printf ("%12d %s\n", $num, $str); + + return join ('-', $1, $2) if ($str =~ m#^([A-Z0-9]{4})([A-Z0-9]{4})$#); + } while ($tries-- > 0); + +# TODO: print some kind of warning? + + return ($str); +} + +1; + diff --git a/lib/Module.pm b/lib/Module.pm new file mode 100644 index 0000000000000000000000000000000000000000..306c9e56f2fd02e5eae9b4afedd844013a56840f --- /dev/null +++ b/lib/Module.pm @@ -0,0 +1,88 @@ +#!/usr/bin/perl +# +# $Id: Module.pm,v 1.5 2014/04/17 09:55:24 gonter Exp $ + +use strict; + +package Module; + +__PACKAGE__->main() unless caller(); + +sub main +{ + print join (' ', __FILE__, __LINE__, 'main: caller=['. caller(). ']'), "\n"; +} + +sub new +{ + my $class= shift; + + my $obj= {}; + bless $obj, $class; + + $obj->set (@_); + + $obj; +} + +sub set +{ + my $obj= shift; + my %par= @_; + + my %res; + foreach my $par (keys %par) + { + $res{$par}= $obj->{$par}; + $obj->{$par}= $par{$par}; + } + + (wantarray) ? %res : \%res; +} + +sub get_array +{ + my $obj= shift; + my @par= @_; + + my @res; + foreach my $par (@par) + { + push (@res, $obj->{$par}); + } + + (wantarray) ? @res : \@res; +} + +sub get_hash +{ + my $obj= shift; + my @par= @_; + + my %res; + foreach my $par (@par) + { + $res{$par}= $obj->{$par}; + } + + (wantarray) ? %res : \%res; +} + +*get= *get_array; + +1; + +__END__ + +=head1 NAME + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +=head1 BUGS + +=head1 REFERENCES + +=head1 AUTHOR + diff --git a/lib/Phaidra/DataCite.pm b/lib/Phaidra/DataCite.pm new file mode 100644 index 0000000000000000000000000000000000000000..0ae63f4624efd763e14454ee5c00097355bf7af6 --- /dev/null +++ b/lib/Phaidra/DataCite.pm @@ -0,0 +1,159 @@ + +=head1 NAME + + Phaidra::DataCite + +=head1 DESCRIPTION + +Interact with a Phaidra instance to retrieve data relevant for DataCite. + + +=cut + +package Phaidra::DataCite; + +use strict; +use utf8; + +use base 'Module'; + +use LWP; +# use Crypt::SSLeay; +use HTTP::Headers::Util; + +use XML::LibXML; +use XML::Writer; + +my $datacite_ns= 'http://datacite.org/schema/kernel-4'; +my $datacite_schemaLocation= 'http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4/metadata.xsd'; + +sub get_metadata +{ + my $obj= shift; + my $pid= shift; + my $fmt= shift || 'json'; # TODO ... + + my $accept= 'application/json'; + my $url_md= join ('/', $obj->{config}->{api_url}, 'object', $pid, 'datacite'); + + if ($fmt eq 'xml') + { + $url_md .= '?format=xml'; + $accept= 'application/json'; + } + + print __LINE__, " fmt=[$fmt] url_md=[$url_md]\n"; + +# build request + my $headers = HTTP::Headers->new( + 'Accept' => $accept, + # 'Content-Type' => $content_type + ); + + my $req = HTTP::Request->new('GET' => $url_md, $headers); + # not needed here $req->authorization_basic($user_name, $user_pw); + + # pass request to the user agent and get a response back + my $ua= LWP::UserAgent->new; + my $res= $ua->request($req); + # print __LINE__, " res=[$res] ", main::Dumper ($res); + + my $code= $res->code(); + my $content= $res->content(); + # print __LINE__, " code=[$code] content=[$content]\n"; + # main::hex_dump ($content); + + # TODO/HACK: fix double UTF8 encoding handed down from PhaidraAPI + # ... + + ($code, $content); +} + +# stolen from PhaidraAPI/Model/Datacite.pm +sub json_2_xml +{ + my $self= shift; + my $json= shift; + + my $prefixmap = { + $datacite_ns => 'datacite', + 'http://www.w3.org/2001/XMLSchema-instance' => 'xsi' + }; + my $forced_declarations = [ + $datacite_ns, + 'http://www.w3.org/2001/XMLSchema-instance' + ]; + + my $xml= <<"EOX"; +<?xml version="1.0" encoding="UTF-8"?> +EOX + + my $writer = XML::Writer->new( + OUTPUT => \$xml, + # NAMESPACES => 1, + PREFIX_MAP => $prefixmap, + # FORCED_NS_DECLS => $forced_declarations, + DATA_MODE => 1, + ENCODING => 'utf-8' + ); + + $writer->startTag('resource', 'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance', 'xmlns' => 'http://datacite.org/schema/kernel-4', 'xsi:schemaLocation' => $datacite_schemaLocation); + json_2_xml_rec(undef, $json, $writer); + $writer->endTag('resource'); + + $writer->end(); + + return $xml; +} + +=head1 INTERNAL FUNCTIONS + +=head2 $xml= json_2_xml_rec($json) + +=cut + +sub json_2_xml_rec +{ + my $parent = shift; + my $children = shift; + my $writer = shift; + + foreach my $child (@{$children}){ + + my $children_size = defined($child->{children}) ? scalar (@{$child->{children}}) : 0; + my $attributes_size = defined($child->{attributes}) ? scalar (@{$child->{attributes}}) : 0; + + if((!defined($child->{value}) || ($child->{value} eq '')) && $children_size == 0 && $attributes_size == 0){ + next; + } + + if (defined($child->{attributes}) && (scalar @{$child->{attributes}} > 0)){ + my @attrs; + foreach my $a (@{$child->{attributes}}){ + if(defined($a->{value}) && $a->{value} ne '') + { + if($a->{xmlname} eq 'DOESNOTWORK_lang'){ + push @attrs, ['http://www.w3.org/XML/1998/namespace', 'lang'] => $a->{value}; + }else{ + push @attrs, $a->{xmlname} => $a->{value}; + } + } + } + + $writer->startTag($child->{xmlname}, @attrs); + }else{ + $writer->startTag($child->{xmlname}); + } + + if($children_size > 0){ + json_2_xml_rec($child, $child->{children}, $writer); + }else{ + $writer->characters($child->{value}); + } + + $writer->endTag($child->{xmlname}); + } +} + +1; + diff --git a/mintdoi.pl b/mintdoi.pl new file mode 100755 index 0000000000000000000000000000000000000000..e1659815955163868687f0cc1db913e93315727d --- /dev/null +++ b/mintdoi.pl @@ -0,0 +1,49 @@ +#!/usr/bin/perl + +use strict; + +use Data::Dumper; +$Data::Dumper::Indent= 1; + +use Encode::Base32::Crockford; + +# check(); exit; + +my $cnt= 10_000_000; + +while ($cnt-- > 0) +{ + my $str= mk_doi(); + # next unless (defined ($str)); + print $str, "\n"; +} + +exit(0); + +sub mk_doi +{ + my $tries= 3; + my $str; + + do + { + my $num= rand(33285996544) + 1073741824; + # my $num= rand(33286544) + 107824; + $str= Encode::Base32::Crockford::base32_encode_with_checksum($num); + # printf ("%12d %s\n", $num, $str); + + return join ('-', $1, $2) if ($str =~ m#^([A-Z0-9]{4})([A-Z0-9]{4})$#); + } while ($tries-- > 0); + + return ($str); +} + +sub check +{ + for (my $num= 1073741824;; $num += 1048576) + { + my $str= Encode::Base32::Crockford::base32_encode_with_checksum($num); + printf ("%12d %s\n", $num, $str); + } +} +