From dcc987a2836d100fe69160f2184ae4c2fd20579b Mon Sep 17 00:00:00 2001 From: Gerhard Gonter <ggonter@gmail.com> Date: Sun, 12 Jul 2020 13:14:59 +0200 Subject: [PATCH] parser for static othes abstract files in index.html --- parse_othes_index.pl | 315 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 315 insertions(+) create mode 100755 parse_othes_index.pl diff --git a/parse_othes_index.pl b/parse_othes_index.pl new file mode 100755 index 0000000..b45e9ef --- /dev/null +++ b/parse_othes_index.pl @@ -0,0 +1,315 @@ +#!/usr/bin/perl + +use strict; + +use Data::Dumper; +$Data::Dumper::Indent= 1; +$Data::Dumper::Sortkeys= 1; + +use FileHandle; + +binmode( STDIN, ':utf8' ); +binmode( STDOUT, ':utf8' ); autoflush STDOUT 1; +binmode( STDERR, ':utf8' ); autoflush STDERR 1; + +use HTML::TreeBuilder 5 -weak; # Ensure weak references in use + +use Util::JSON; + +use FindBin; +use lib join('/', $FindBin::Bin, 'lib'); +use IRMA::db; + +my $config_fnm= '/etc/irma/eprints.json'; + +my $force= 0; + +my @PARS; +my $arg; +while (defined ($arg= shift (@ARGV))) +{ + if ($arg eq '--') { push (@PARS, @ARGV); @ARGV= (); } + elsif ($arg =~ /^--(.+)/) + { + my ($opt, $val)= split ('=', $1, 2); + if ($opt eq 'help') { usage(); } + elsif ($opt eq 'force') { $force= (defined ($val)) ? $val : 1; } + else { usage(); } + } + elsif ($arg =~ /^-(.+)/) + { + foreach my $opt (split ('', $1)) + { + if ($opt eq 'h') { usage(); exit (0); } + elsif ($opt eq 'F') { $force= 1; } + else { usage(); } + } + } + else + { + push (@PARS, $arg); + } +} + +my $cnf= Util::JSON::read_json_file ($config_fnm); +my $ot2ut= IRMA::db::get_any_db($cnf, 'ot2ut_database'); +my $col_othes_policy= $ot2ut->get_collection('othes.policy'); +# print __LINE__, " ot2ut=[$ot2ut]\n"; + +PAR: foreach my $fnm (@PARS) +{ + update_file($fnm); +} +exit(0); + +sub update_file +{ + my $fnm= shift; + + my @fnm= split('/', $fnm); + my $last= pop (@fnm); + + unless ($last eq 'index.html') + { + print __LINE__, " ATTN: not an index.html: last=[$last]\n"; + return undef; + } + + my @last4= splice(@fnm, -4); + my $eprint_id= join('', @last4)+0; + # print __LINE__, " last4=[$last4] [",join(':', @last4), "]\n"; return undef; + + my @st= stat($fnm); + return undef unless (@st); + my $mtime= $st[9]; + + my $search= { eprint_id => "$eprint_id" }; + # print __LINE__, " search: ", Dumper($search); + my $policy_rec= $col_othes_policy->find_one($search); + # print __LINE__, " policy_rec=[$policy_rec]\n"; + if (defined ($policy_rec) && $policy_rec->{mtime} == $mtime && !$force) + { + # print __LINE__, " already processed; skipping\n"; + # print __LINE__, " ", Dumper($policy_rec); + return undef; + } + + my $policy_info= process_index_file($fnm); + + $policy_info->{mtime}= $mtime; + + # delete ($policy_info->{document_paths}); + print __LINE__, " policy_info: ", Dumper($policy_info); + + if (defined ($policy_rec)) + { + my $rc_upd= $col_othes_policy->update( { _id => $policy_rec->{_id} }, $policy_info ); + print __LINE__, " policy update: rc_upd=[$rc_upd] ", Dumper($rc_upd); + } + else + { + if (exists ($policy_info->{eprint_id})) + { + my $rc_ins= $col_othes_policy->insert( $policy_info ); + print __LINE__, " policy insert: rc_ins=[$rc_ins] ", Dumper($rc_ins); + } + else + { + print __LINE__, " WARNING: eprint_id missing\n"; + } + } +} + +sub parse_document_url +{ + my $href= shift; + + if ($href =~ m#https?://othes.univie.ac.at/((\d+)/(\d+)/([^/]+))$#) + { + my ($document_path, $eprint_id, $seq_num, $filename)= ($1, $2, $3, $4); + + return + { + document_path => $document_path, + document_url => $href, + eprint_id => $eprint_id, + seq_num => $seq_num, + filename => $filename, + is_downloadable => 0, + }; + } + + undef; +} + +sub process_index_file +{ + my $fnm= shift; + + my $tree= HTML::TreeBuilder->new; # empty tree + $tree->parse_file($fnm); + + # print "Hey, here's a dump of the parse tree=[$tree] of fnm=[$fnm]:\n"; + + # $tree->dump; # a method we inherit from HTML::Element + # print "And here it is, bizarrely rerendered as HTML:\n", $tree->as_HTML, "\n"; + +=begin comment + + print "\n\n", '='x70,"\n"; + print "ALL HTML\n"; + print '='x70,"\n"; + $tree->dump(); + # $tree->as_HTML(); + print '='x70,"\n"; + +=end comment +=cut + +=begin comment + + my @nodes= $tree->content_list(); + print __LINE__, " nodes: ", scalar @nodes, "\n"; + foreach my $node (@nodes) + { + print __LINE__, " node=[$node]\n"; + print $node->dump(); + } + +=end comment +=cut + + my %result= + ( + filename => $fnm, + # mtimte => $mtime, + ); + my $cnt_warnings= 0; + my %document_paths= (); + + # 1. check meta elements + # my @meta_elements= $tree->find_by_tag_name('meta'); + my @meta_elements= $tree->look_down( _tag => 'meta' ); + # print __LINE__, " meta_elements: ", scalar @meta_elements, "\n"; + foreach my $meta_element (@meta_elements) + { + # print $meta_element->dump(); + my $name= $meta_element->attr('name'); + my $content= $meta_element->attr('content'); + # print __LINE__, " meta: name=[$name] content=[$content]\n"; + + if ($name eq 'eprints.document_url') + { + my $res= parse_document_url($content); + + if (defined ($res)) + { + $document_paths{$res->{document_path}}= $res; + } + else + { + push (@{$result{unknown_document_paths_meta}}, $content); + $cnt_warnings++; + } + } + elsif ($name =~ m#eprints\.(full_text_status|ispublished|metadata_visibility)#) { my $an= $1; $result{$an}= $content; } + elsif ($name eq 'DC.relation') + { + if ($content =~ m#https?://othes.univie.ac.at/(\d+)/$#) + { + $result{eprint_id}= $1; + $result{othes_url}= $content; + } + } + } + +=begin comment + + my @doc_cit= $tree->look_down( _tag => 'span', class => 'ep_document_citation' ); + foreach my $doc_cit (@doc_cit) + { + print __LINE__, " doc_cit:\n"; + print '='x70,"\n"; + print $doc_cit->dump(); + # print $doc_cit->as_HTML(), "\n"; + } + +=end comment +=cut + + my @anchors= $tree->look_down( _tag => 'a' ); + my %hrefs= (); + foreach my $anchor (@anchors) + { + my $href= $anchor->attr('href'); + # print __LINE__, " anchor: href=[$href]\n"; + $hrefs{$href}++; + + # print '='x70,"\n"; + # print $anchor->dump(); + # print $div1->as_HTML(), "\n"; + } + + # print __LINE__, " hrefs: ", Dumper(\%hrefs); + foreach my $href (keys %hrefs) + { + if ($href =~ m#https?://othes.univie.ac.at/((\d+)/(\d+)/([^/]+))$#) + { + my $res= parse_document_url($href); + + if (defined ($res)) + { + if (exists ($document_paths{$res->{document_path}})) + { + $document_paths{$res->{document_path}}->{is_downloadable}= 1; + } + else + { + push (@{$result{unmatched_document_paths_href}}, $href); + $cnt_warnings++; + } + } + else + { + push (@{$result{unknown_document_paths_href}}, $href); + $cnt_warnings++; + } + } + elsif ($href =~ m#https?://resolver.obvsg.at/(urn:nbn:at:.*)#) + { + $result{urn}= $1; + $result{urn_resolver_url}= $href; + } + elsif ($href =~ m#https?://ubdata.univie.ac.at/(AC\d{8})#) + { + $result{ac_number}= $1; + $result{catalog_url}= $href; + } + elsif ($href =~ m#https?://doi\.org/(.*)#) + { + $result{doi}= $1; + $result{doi_resolver_url}= $href; + } + } + + # Now that we're done with it, we must destroy it. + # $tree = $tree->delete; # Not required with weak references + + my ($cnt_docs, $cnt_public, $cnt_restricted)= (0, 0, 0); + my @document_paths; + foreach my $dp (keys %document_paths) + { + my $x= $document_paths{$dp}; + if ($x->{is_downloadable}) { $cnt_public++; } else { $cnt_restricted++; } + $cnt_docs++; + } + + $result{cnt_docs}= $cnt_docs; + $result{cnt_public}= $cnt_public; + $result{cnt_restricted}= $cnt_restricted; + $result{cnt_warnings}= $cnt_warnings; + $result{document_paths}= [ values %document_paths ]; + + \%result; +} + -- GitLab