parser for static othes abstract files in index.html

dcc987a2 · Gerhard Gonter · 5ca6244f · dcc987a2
Commit dcc987a2 authored 4 years ago by Gerhard Gonter
--- a/parse_othes_index.pl
+++ b/parse_othes_index.pl
+#!/usr/bin/perl
+
+use strict;
+
+use Data::Dumper;
+$Data::Dumper::Indent= 1;
+$Data::Dumper::Sortkeys= 1;
+
+use FileHandle;
+
+binmode( STDIN,  ':utf8' );
+binmode( STDOUT, ':utf8' ); autoflush STDOUT 1;
+binmode( STDERR, ':utf8' ); autoflush STDERR 1;
+
+use HTML::TreeBuilder 5 -weak; # Ensure weak references in use
+
+use Util::JSON;
+
+use FindBin;
+use lib join('/', $FindBin::Bin, 'lib');
+use IRMA::db;
+
+my $config_fnm= '/etc/irma/eprints.json';
+
+my $force= 0;
+
+my @PARS;
+my $arg;
+while (defined ($arg= shift (@ARGV)))
+{
+  if ($arg eq '--') { push (@PARS, @ARGV); @ARGV= (); }
+  elsif ($arg =~ /^--(.+)/)
+  {
+    my ($opt, $val)= split ('=', $1, 2);
+    if ($opt eq 'help') { usage(); }
+    elsif ($opt eq 'force') { $force= (defined ($val)) ? $val : 1; }
+    else { usage(); }
+  }
+  elsif ($arg =~ /^-(.+)/)
+  {
+    foreach my $opt (split ('', $1))
+    {
+         if ($opt eq 'h') { usage(); exit (0); }
+      elsif ($opt eq 'F') { $force= 1; }
+      else { usage(); }
+    }
+  }
+  else
+  {
+    push (@PARS, $arg);
+  }
+}
+
+my $cnf= Util::JSON::read_json_file ($config_fnm);
+my $ot2ut= IRMA::db::get_any_db($cnf, 'ot2ut_database');
+my $col_othes_policy= $ot2ut->get_collection('othes.policy');
+# print __LINE__, " ot2ut=[$ot2ut]\n";
+
+PAR: foreach my $fnm (@PARS)
+{
+  update_file($fnm);
+}
+exit(0);
+
+sub update_file
+{
+  my $fnm= shift;
+
+  my @fnm= split('/', $fnm);
+  my $last= pop (@fnm);
+
+  unless ($last eq 'index.html')
+  {
+    print __LINE__, " ATTN: not an index.html: last=[$last]\n";
+    return undef;
+  }
+
+  my @last4= splice(@fnm, -4);
+  my $eprint_id= join('', @last4)+0;
+  # print __LINE__, " last4=[$last4] [",join(':', @last4), "]\n"; return undef;
+
+  my @st= stat($fnm);
+  return undef unless (@st);
+  my $mtime= $st[9];
+
+  my $search= { eprint_id => "$eprint_id" };
+  # print __LINE__, " search: ", Dumper($search);
+  my $policy_rec= $col_othes_policy->find_one($search);
+  # print __LINE__, " policy_rec=[$policy_rec]\n";
+  if (defined ($policy_rec) && $policy_rec->{mtime} == $mtime && !$force)
+  {
+    # print __LINE__, " already processed; skipping\n";
+    # print __LINE__, " ", Dumper($policy_rec);
+    return undef;
+  }
+
+  my $policy_info= process_index_file($fnm);
+
+  $policy_info->{mtime}= $mtime;
+
+  # delete ($policy_info->{document_paths});
+  print __LINE__, " policy_info: ", Dumper($policy_info);
+
+  if (defined ($policy_rec))
+  {
+    my $rc_upd= $col_othes_policy->update( { _id => $policy_rec->{_id} }, $policy_info );
+    print __LINE__, " policy update: rc_upd=[$rc_upd] ", Dumper($rc_upd);
+  }
+  else
+  {
+    if (exists ($policy_info->{eprint_id}))
+    {
+      my $rc_ins= $col_othes_policy->insert( $policy_info );
+      print __LINE__, " policy insert: rc_ins=[$rc_ins] ", Dumper($rc_ins);
+    }
+    else
+    {
+      print __LINE__, " WARNING: eprint_id missing\n";
+    }
+  }
+}
+ 
+sub parse_document_url
+{
+  my $href= shift;
+
+  if ($href =~ m#https?://othes.univie.ac.at/((\d+)/(\d+)/([^/]+))$#)
+  {
+    my ($document_path, $eprint_id, $seq_num, $filename)= ($1, $2, $3, $4);
+
+    return
+    {
+      document_path   => $document_path,
+      document_url    => $href,
+      eprint_id       => $eprint_id,
+      seq_num         => $seq_num,
+      filename        => $filename,
+      is_downloadable => 0,
+    };
+  }
+
+  undef;
+}
+
+sub process_index_file
+{
+  my $fnm= shift;
+
+  my $tree= HTML::TreeBuilder->new; # empty tree
+  $tree->parse_file($fnm);
+
+  # print "Hey, here's a dump of the parse tree=[$tree] of fnm=[$fnm]:\n";
+
+  # $tree->dump; # a method we inherit from HTML::Element
+  # print "And here it is, bizarrely rerendered as HTML:\n", $tree->as_HTML, "\n";
+
+=begin comment
+
+  print "\n\n", '='x70,"\n";
+  print "ALL HTML\n";
+  print '='x70,"\n";
+  $tree->dump();
+  # $tree->as_HTML();
+  print '='x70,"\n";
+
+=end comment
+=cut
+ 
+=begin comment
+
+  my @nodes= $tree->content_list();
+  print __LINE__, " nodes: ", scalar @nodes, "\n";
+  foreach my $node (@nodes)
+  {
+    print __LINE__, " node=[$node]\n";
+    print $node->dump();
+  }
+
+=end comment
+=cut
+
+  my %result=
+  (
+    filename => $fnm,
+    # mtimte => $mtime,
+  );
+  my $cnt_warnings= 0;
+  my %document_paths= ();
+
+  # 1. check meta elements
+  # my @meta_elements= $tree->find_by_tag_name('meta');
+  my @meta_elements= $tree->look_down( _tag => 'meta' );
+  # print __LINE__, " meta_elements: ", scalar @meta_elements, "\n";
+  foreach my $meta_element (@meta_elements)
+  {
+    # print $meta_element->dump();
+    my $name= $meta_element->attr('name');
+    my $content= $meta_element->attr('content');
+    # print __LINE__, " meta: name=[$name] content=[$content]\n";
+
+    if ($name eq 'eprints.document_url')
+    {
+      my $res= parse_document_url($content);
+
+      if (defined ($res))
+      {
+        $document_paths{$res->{document_path}}= $res;
+      }
+      else
+      {
+        push (@{$result{unknown_document_paths_meta}}, $content);
+        $cnt_warnings++;
+      }
+    }
+    elsif ($name =~ m#eprints\.(full_text_status|ispublished|metadata_visibility)#) { my $an= $1; $result{$an}= $content; }
+    elsif ($name eq 'DC.relation')
+    {
+      if ($content =~ m#https?://othes.univie.ac.at/(\d+)/$#)
+      {
+        $result{eprint_id}= $1;
+        $result{othes_url}= $content;
+      }
+    }
+  }
+
+=begin comment
+
+  my @doc_cit= $tree->look_down( _tag => 'span', class => 'ep_document_citation' );
+  foreach my $doc_cit (@doc_cit)
+  {
+    print __LINE__, " doc_cit:\n";
+    print '='x70,"\n";
+    print $doc_cit->dump();
+    # print $doc_cit->as_HTML(), "\n";
+  }
+
+=end comment
+=cut
+
+  my @anchors= $tree->look_down( _tag => 'a' );
+  my %hrefs= ();
+  foreach my $anchor (@anchors)
+  {
+    my $href= $anchor->attr('href');
+    # print __LINE__, " anchor: href=[$href]\n";
+    $hrefs{$href}++;
+
+    # print '='x70,"\n";
+    # print $anchor->dump();
+    # print $div1->as_HTML(), "\n";
+  }
+
+  # print __LINE__, " hrefs: ", Dumper(\%hrefs);
+  foreach my $href (keys %hrefs)
+  {
+    if ($href =~ m#https?://othes.univie.ac.at/((\d+)/(\d+)/([^/]+))$#)
+    {
+      my $res= parse_document_url($href);
+
+      if (defined ($res))
+      {
+        if (exists ($document_paths{$res->{document_path}}))
+        {
+          $document_paths{$res->{document_path}}->{is_downloadable}= 1;
+        }
+        else
+        {
+          push (@{$result{unmatched_document_paths_href}}, $href);
+          $cnt_warnings++;
+        }
+      }
+      else
+      {
+        push (@{$result{unknown_document_paths_href}}, $href);
+        $cnt_warnings++;
+      }
+    }
+    elsif ($href =~ m#https?://resolver.obvsg.at/(urn:nbn:at:.*)#)
+    {
+      $result{urn}= $1;
+      $result{urn_resolver_url}= $href;
+    }
+    elsif ($href =~ m#https?://ubdata.univie.ac.at/(AC\d{8})#)
+    {
+      $result{ac_number}= $1;
+      $result{catalog_url}= $href;
+    }
+    elsif ($href =~ m#https?://doi\.org/(.*)#)
+    {
+      $result{doi}= $1;
+      $result{doi_resolver_url}= $href;
+    }
+  }
+
+  # Now that we're done with it, we must destroy it.
+  # $tree = $tree->delete; # Not required with weak references
+
+  my ($cnt_docs, $cnt_public, $cnt_restricted)= (0, 0, 0);
+  my @document_paths;
+  foreach my $dp (keys %document_paths)
+  {
+    my $x= $document_paths{$dp};
+    if ($x->{is_downloadable}) { $cnt_public++; } else { $cnt_restricted++; }
+    $cnt_docs++;
+  }
+
+  $result{cnt_docs}=       $cnt_docs;
+  $result{cnt_public}=     $cnt_public;
+  $result{cnt_restricted}= $cnt_restricted;
+  $result{cnt_warnings}=   $cnt_warnings;
+  $result{document_paths}= [ values %document_paths ];
+
+  \%result;
+}
+