diff --git a/eprints1.pl b/eprints1.pl
index ce2b326a81e28dead102a353695130793c0221fc..52175d39be6a38532bc19503c099c31f6b748582 100755
--- a/eprints1.pl
+++ b/eprints1.pl
@@ -205,6 +205,107 @@ my %bucketlist_column_descriptions=
 
 my %ot2ut_sync_anyway= map { $_ => 1 } qw(33905); # these should be synced, even if they were already marked as ok
 
+# fixup names: state of the report written by fixup_name()
+my $fn_init= 0; # 0: report not opened yet, 1: report open, -1: report could not be opened
+local *FN_lst; # filehandle for the report file fixup_names.tsv
+my @fn_lst= qw(ac_number eprint_id column_name name notes1 fixed1 nn vn); # report columns, in output order
+my $fn_cnt= 0; # number of rows written to the report
+
+# This should be factored out!!  Known first names, used by fixup_name() to pick given names.
+my @first_names= qw(
+  Adam Adelheid Alejandro Alexa Alexander Alexandra Alfred Alice Alina
+  Alois Amadou-Lamine Ana András Andre Andrea Andreas Angela Angeles
+  Ania Anika Anjte Anke Anna Annegret Annemarie Anton Antonio Aprile
+  Armando Armin Arnd Arno Arnold Arthur Aslan Astrid Awad Axel Barbara
+  Barbara-Amina Beatrice Berhard Bernd Bernd-Christian Bernhard Berta
+  Bertrand Bettecke Bettina Birgit Birgitta Björn Bodo Bodomo Boris
+  Brigitta Brigitte Caecilie Cesare Chris Christa Christian Christiane
+  Christian-Hubert Christine Christof Christoph Christopher Claire
+  Claudia Claus Clemens Constanze Dagmar Daiana Daniel Daniela David
+  Diamantopoulos Diana Dieter Dietmar Dimitris Dirim Donald Doris Dorothea
+  Dorothée Eberhard Eckhard Edit Eduard Elisabeth Elke Elmar Erhard
+  Erich Erik Ernst Estella Esther Eugen Eva Eva-Maria Eveline Evrim Ewald
+  Federico Fedor Ferdinand Floortje Florian France Frank Franz
+  Franz-Markus Franz-Stefan Friederike Friedhelm Friedrich Frigerio
+  Fritz Gabriela Gabriele Garcia Georg George Georgios Gerald Gerda
+  Gerhard Gerit Germain Gernot Gero Gerte Gertraud Gil Gordon Gottfried
+  Grandner Gualtiero Gunda Gunnar Gunter Günter Günther Gustav Gyongyi
+  Gyöngyi Hanna Hannes Hanno Hans Hans-Georg Hans-Jürgen Hanspeter
+  Harald Heidemarie Heiner Heinrich Heinz Helen Helga Helmut Helmuth
+  Henk Henning Henry Herbert Hermann Hermine Herwig Hilde Hildegard
+  Horst Hristov Iacopo Igor Ilona Ilse Ines Ingeborg Ingfrid Ingo Ingrid Irene
+  Irmgard Irmtraud Isabella Isabelle Item Jadranka Jakiša Jan Jan-Heiner
+  Jens Jiří Joan Joao Johann Johanna Johannes Jörg Jorinde Josef
+  Josipovic József Judith Julia Julius Jürg Jürgen Jutta Karel Karen
+  Karin Karl Karl-Heinz Katharina Katharine Kathrin Katja Katrin Kerstin
+  Kim Kirsten Klara Klára Klaus Klaus-Börge Klaus-Dieter Klemens Konrad
+  Konstanze Korina Kornelia Kristina Kurt Larisa Leopold Lieselotte Lorenz
+  Lothar Ludger Lukas Lydia Maciel Magdalena Majchrzak Manfred Manuela
+  Marc Marcello Marco Margareta Margarete Margareth Margaretha Margit
+  Maria Marianne Marie-France Marija Mario Marion Marko Markus Martin
+  Martina Mathilde Matthias Maximilian Melissa Meta Michael Michaela
+  Michaela-Maria Michal Michela Michele Michèle Michelle Milena Mira
+  Miranda Moga Mona Monika Monje Nadine Nadja Natalia Nicole Nikolaus
+  Norbert Nuno Oliver Oskar Oskár Otmar Otto Patricia Patrick Paul Paulus
+  Peter Petra Philip Philipp Pia Qi Rainer Ralf Ramon Ramón Raphael
+  Regina Regine Reingard Reinhard Reinhold Renate René Richard Robert
+  Robin Roland Ronald Rosa Rüdiger Rüdiger Rudolf Rupert Ruth Sabina
+  Sabine Sami Sandra Sarah Sascha Saskia Savvas Sebastian Segeja Serge
+  Sergey Sergio Siegfried Sieglinde Sigmar Sigrid Simon Sonia Sonja
+  Sophie Stefan Stefanie Stefan-Michael Steffen Stephan Stephanie Susanne
+  Susi Suzanne Sylvia Tamara Taťána Tatjana Tecumseh Theodoros Thierry
+  Thomas Timothy Tobias Tom Udo Ulf Ulrich Ulrike Urs Ursula Ute Verena
+  Veronika Viera Viktor Vincente Violetta Vittorio Vlastimil Waldemar Walter
+  Walther Werner Wieland Wilhelm Wolfgang Wolfram Wynfrid Yan Yulia
+);
+
+my %first_names= map { $_ => 1 } @first_names; # lookup table for exists() tests
+
+my %special_name_mapping= # raw name => { fn => 'Family, Given' }, consulted by fixup_name() before the generic rules
+(
+  'THEIS Lioba' => { fn => 'Theis, Lioba' },
+  'Univ. Prof in. Dr. in med.univ. Margarethe Geiger' => { fn => 'Geiger, Margarethe' },
+  'Mag. Dr, Manfred Glauninger, Privatdoz.' => { fn => 'Glauninger, Manfred' },
+  'Dipl.-Ing. Dr. -Ing. Henry Jäger' => { fn => 'Jäger, Henry' },
+  'Frau Assoz. Profin. MMaga. DDrin. Esther Ramharter' => { fn => 'Ramharter, Esther' },
+  'ao. Univ.-Prof. tit. Univ.-Prof. Dipl.-Ing. Dr. Erich Schikuta' => { fn => 'Schikuta, Erich' },
+  'Pr. Dr. Arthur Rachwald' => { fn => 'Rachwald, Arthur' },
+  'Dr. Manuel.D Montaño' => { fn => 'Montaño, Manuel D.' },
+  'V.-Prof. Doz. Dr. Marie- France Chevron' => { fn => 'Chevron, Marie-France' },
+  'Mgr. Michal Dovrecký. PhD' => { fn => 'Dovrecký, Michal' },
+  'ao. Univ.-Prof. Dipl.-Geol. Dr. Christa-Ch. Hofmann' => { fn => 'Hofmann, Christa-Ch.' },
+  'Bago- Pilátová Martina' => { fn => 'Bago-Pilátová, Martina' },
+  'Ania-Martinez Ana Begona' => { fn => 'Ania-Martinez, Ana Begona' },
+  'DR. DR. AMADOU-LAMINE SARR' => { fn => 'Sarr, Amadou-Lamine' },
+  'em. Univ. Prof .DDr. Paul Michael Zulehner' => { fn => 'Zulehner, Paul Michael' },
+  'Hennig Schluß' => { fn => 'Schluß, Henning' },
+# 'Univ. Lektor Mag. Dr. Christian Sitte' => { fn => 'Schluß, Henning' }, # copy-paste duplicate of the key below with the wrong value
+  'RACHWALD ARTHUR' => { fn => 'Rachwald, Arthur R.' },
+  'Professor Dr. Arthur R. Rachwald' => { fn => 'Rachwald, Arthur R.' },
+  'Univ. Lektor Mag. Dr. Christian Sitte' => { fn => 'Sitte, Christian' },
+  'Univ.-Prof. Dr. Mathes, Klaus-Dieter, Privatdoz. M.A.' => { fn => 'Mathes, Klaus-Dieter' },
+  'Univ.-Prof.MMag DDr. Rupert Klieber' => { fn => 'Klieber, Rupert' },
+  'Em.Univ.-Prof.Dipl.Soz.Dr. Paul Kellermann' => { fn => 'Kellermann, Paul' },
+  'Univ.-Ass. Mag. Dr. Matthias Flatscher' => { fn => 'Flatscher, Matthias' },
+  'Univ.-Prof. Dipl-Ing. Dr. Arnold Baca' => { fn => 'Baca, Arnold' },
+  'PhD Savvas Stafylidis' => { fn => 'Stafylidis, Savvas' },
+  'Dipl.-Ernährungswiss. Dr. Barbara Lieder' => { fn => 'Lieder, Barbara' },
+  'Weber, Gerhard, W.' => { fn => 'Weber, Gerhard W.' },
+  'Dr. Martin Melzer, LL.M., TEP' => { fn => 'Melzer, Martin' },
+  'Monje Quiroga Francisco' => { fn => 'Francisco, Monje Quiroga' }, # NOTE(review): family and given name look swapped - confirm
+  'Univ.-Prof. MMMMag. Dr. Pokorny, Lukas, M.A.' => { fn => 'Pokorny, Lukas' },
+  'e.m. a.o. Unif.-Prof. Dr. Olaf Bockhorn' => { fn => 'Bockhorn, Olaf' },
+  'o. Univ.-Prof. DDr. Ludger Müller M.A.' => { fn => 'Müller, Ludger' },
+  'WEISS Friedl' => { fn => 'Weiss, Friedl' },
+  'Priv. Uni. Doz. Mag. DDr. Julia Wippersberg' => { fn => 'Wippersberg, Julia' },
+  'Univ.-Prof Mag. Dr. Gerhard Budin' => { fn => 'Budin, Gerhard' },
+  'Uni.-Prof. Dr. Ulrich Teichler' => { fn => 'Teichler, Ulrich' },
+  'Ao. Univ.-Porf. Dr. REINPRECHT Christoph' => { fn => 'Reinprecht, Christoph' },
+  'ao. Univ.-Prof. Dr. Fritz (Friedrich) Hausjell' => { fn => 'Hausjell, Friedrich' }, # according to ufind the first name is "Friedrich", see https://ufind.univie.ac.at/en/person.html?id=1872
+
+# 'PD DDr Wippersberg Julia' => { fn => 'Wippersberg, Julia' },
+);
+
 my %doc_embargo_dates;
 my $base_path= '/var/www/ot2ut'; # TODO(maybe): get this from the config...
 # END OT2UT: Othes to Utheses
@@ -291,9 +392,10 @@ while (defined ($arg= shift (@ARGV)))
 my $sleep_urn_request= 3;
 my $running= 1;
 $SIG{INT}= sub { $running= 0; };
+$SIG{USR1}= sub { $running= 0; }; # same effect as INT
 
 my $serving_requests= 1;
-$SIG{USR1}= sub { $serving_requests= 0; };
+$SIG{USR2}= sub { $serving_requests= 0; }; # this handler was bound to USR1 before
 
 # Agent mode
 my $db_ot2ut;
@@ -2849,7 +2951,7 @@ sub ot2ut
           push (@synced, $el);
           $col_sync->insert($el);
           $cnt_errors_upload++;
-          sleep(2);
+          # sleep(2);
         }
         else
         {
@@ -2897,7 +2999,7 @@ sub ot2ut
         send_message("upload $upload_success: eprint_id=[$eprint_id] eprint_status=[$eprint_status] lastmod=[$lastmod] context=[$ot2ut_context] utheses_id=[$utheses_id] time_total=$td_start time_upload=$td_curl") unless ($silent_upload_success && $upload_success eq 'ok');
       }
 
-      sleep(2);
+      # sleep(2);
 
       if (defined ($utheses_id))
       {
@@ -3714,9 +3816,175 @@ mysql> select count(*), abstract_nicht_anzeigen from eprint group by abstract_ni
   (\@errors, \@warnings, \%thesis);
 }
 
-sub cleanup_name
+# Try to turn a malformed person name into a family name and a given name.
+#
+# Parameter: hash ref with the key "name" (the raw name); the keys ac_number,
+# eprint_id and column_name are only copied into the report.  The hash is
+# modified: fixed1, nn, vn and notes1 are filled in.
+#
+# Names listed in %special_name_mapping are replaced by their mapped value;
+# for all other names known titles and degrees are stripped.  The result is
+# split at the first comma ("family, given"); without a comma, tokens that are
+# known first names or initials form the given name, the rest the family name.
+#
+# Every call appends one row to fixup_names.tsv (if that file can be written).
+#
+# Returns ($rc, $nn, $vn); $rc is 'special', 'split' or 'picked' on success
+# and undef when the name was not in the mapping and no given name was found.
+sub fixup_name
 {
-  my $s= shift;
+  my $c= shift;
+
+  unless ($fn_init)
+  {
+    if (open(FN_lst, '>:utf8', 'fixup_names.tsv'))
+    {
+      print FN_lst join("\t", @fn_lst), "\n";
+      $fn_init= 1;
+    }
+    else
+    { # the report is only a by-product, so keep going without it
+      warn "fixup_name: can't write fixup_names.tsv: $!\n";
+      $fn_init= -1; # do not try again on the next call
+    }
+  }
+
+  # let the generic fixups begin
+  my @n; # tags of the rules that were applied, reported as notes1
+  my $f= $c->{name};
+
+  my ($rc, $nn, $vn);
+
+  if (exists ($special_name_mapping{$f}))
+  {
+    my $x= $special_name_mapping{$f};
+    $f= $x->{fn} if (exists ($x->{fn}));
+    $rc= 'special';
+  }
+  else
+  { # the rules run in this order; each one works on what the previous ones left over
+    push (@n, 'aoup')   if ($f=~ s#\ba\.o\s+univ\.\s*prof\.\s*##i);
+    push (@n, 'hr')     if ($f=~ s#\bhr\.\s*##i);
+    push (@n, 'tit')    if ($f=~ s#\btit\.\s*##i);
+    push (@n, 'ao1')    if ($f=~ s#\ba\.\s*\bo\.\s*##i);
+    push (@n, 'ao1')    if ($f=~ s#\bao\s+##i);
+    push (@n, 'ao')     if ($f=~ s#\bao\.\s*##i);
+    push (@n, 'ir')     if ($f=~ s#\bi\.\s*r\.\s*##i);
+    push (@n, 'o')      if ($f=~ s#\bo\.\s*##i);
+    push (@n, 'frau')   if ($f=~ s#\bfrau\s+##i);
+    push (@n, 'emer')   if ($f=~ s#\bemer\.\s*##i);
+    push (@n, 'profin') if ($f=~ s#\bprofin\.\s*##i);
+    push (@n, 'em2')    if ($f=~ s#\be\.m\.\s*##i);
+    push (@n, 'em3')    if ($f=~ s#\bem\.\s*##i);
+    push (@n, 'di')     if ($f=~ s#\bdipl\.[\-\s]*ing\.(in)?\s*##i);
+    push (@n, 'dpsy')   if ($f=~ s#\bdipl\.[\-\s]*psych\.\s*##i);
+    push (@n, 'dgeog')  if ($f=~ s#\bdipl\.[\-\s]*geogr\.\s*##i);
+    push (@n, 'dgeol')  if ($f=~ s#\bdipl\.[\-\s]*geol\.\s*##i);
+    push (@n, 'dtheo')  if ($f=~ s#\bdipl\.[\-\s]*theol\.\s*##i);
+    push (@n, 'dbio')   if ($f=~ s#\bdipl\.[\-\s]*biol\.\s*##i);
+    push (@n, 'dp')     if ($f=~ s#\bdipl\.[\-\s]*p[äÄ]d\.\s*##i);
+    push (@n, 'vp')     if ($f=~ s#\bv\.[\-\s]*prof\.\s*##i);
+    push (@n, 'hp')     if ($f=~ s#\bhon\.[\-\s]*prof\.\s*##i);
+    push (@n, 'up')     if ($f=~ s#\buniv\.?[\-\s]*prof\.(in)?\s*##i);
+    push (@n, 'upt')    if ($f=~ s#\buniv\.[\-\s]*porf\.\s*##i); # typo ...
+    push (@n, 'apin')   if ($f=~ s#\bass\.[\-\s]*prof\.in\s*##i);
+    push (@n, 'ap')     if ($f=~ s#\bass\.[\-\s]*prof\.\s*##i);
+    push (@n, 'acp')    if ($f=~ s#\bassoc\.[\-\s]*prof\.(in)?\s*##i);
+    push (@n, 'azp')    if ($f=~ s#\bassoz\.[\-\s]*prof\.(in)?\s*##i);
+    push (@n, 'pdoz')   if ($f=~ s#\bpriv\.[\s\-]*doz\.\s*##i);
+    push (@n, 'udoz')   if ($f=~ s#\buniv\.[\s\-]*doz\.\s*##i);
+    push (@n, 'uass')   if ($f=~ s#\buniv\.[\s\-]*ass\.\s*##i);
+    push (@n, 'ing')    if ($f=~ s#\bing\.\s*##i);
+    push (@n, 'hr')     if ($f=~ s#\bhr\s+##i);
+    push (@n, 'pd')     if ($f=~ s#\bpd\s+##i);
+    push (@n, 'ra')     if ($f=~ s#\bra\s+##i);
+    push (@n, 'der')    if ($f=~ s#\bder\.\s*##i);
+    push (@n, 'doz')    if ($f=~ s#\bdoz\.\s*##i);
+    push (@n, 'p')      if ($f=~ s#\bprof\.\s*##i);
+    push (@n, 'mgr')    if ($f=~ s#\bmgr\.a\s+##i);
+    push (@n, 'maga')   if ($f=~ s#\bmag\.a\s+##i);
+    push (@n, 'mmag')   if ($f=~ s#\bmmag\.\s*##i);
+    push (@n, 'mag')    if ($f=~ s#\bmag\.\s*##gi);
+    push (@n, 'rndr')   if ($f=~ s#\brndr\.\s*##i);
+    push (@n, 'csc')    if ($f=~ s#\bcsc\.\s*##i);
+    push (@n, 'phdr')   if ($f=~ s#\bphdr\.\s*##i);
+    push (@n, 'ddr')    if ($f=~ s#\bddr\.\s*##i);
+    push (@n, 'ddr2')   if ($f=~ s#\bddr\b\s*##i);
+    push (@n, 'pd2')    if ($f=~ s#\bpd\b\s*##i);
+    push (@n, 'drin')   if ($f=~ s#\bdr\.in\s*##gi);
+    push (@n, 'dra')    if ($f=~ s#\bdr\.a\s*##gi);
+    push (@n, 'drt')    if ($f=~ s#\bdr\.\s*techn\.\s*##gi);
+    push (@n, 'pddr')   if ($f=~ s#\bpd\s+dr\.\s*##gi);
+    push (@n, 'hdr')    if ($f=~ s#\bhofratdr\.\s*##gi);
+    push (@n, 'dr')     if ($f=~ s#\bdr[\.\s]+##gi);
+    push (@n, 'habil')  if ($f=~ s#\bhabil\.\s*##i);
+    push (@n, 'rer')    if ($f=~ s#\brer\.\s*##i);
+    push (@n, 'nat')    if ($f=~ s#\bnat\.\s*##i);
+    push (@n, 'soc')    if ($f=~ s#\bsoc\.\s*##i);
+    push (@n, 'oec')    if ($f=~ s#\boec\.\s*##i);
+    push (@n, 'phil')   if ($f=~ s#\bphil\.\s*##i);
+    push (@n, 'med')    if ($f=~ s#\bmed\.\s*##i);
+    push (@n, 'pharm')  if ($f=~ s#\bpharm\.\s*##i);
+    push (@n, 'u')      if ($f=~ s#\buniv\.\s*##i);
+    push (@n, 'hc1')    if ($f=~ s#\bh\.c\.\s*##i);
+    push (@n, 'hc2')    if ($f=~ s#\bhc\.\s*##i);
+    push (@n, 'phd1')   if ($f=~ s#\bph\.d\.\s*##gi);
+    push (@n, 'ma')     if ($f=~ s#[,\s]+ma\s*$##i);
+    push (@n, 'mas')    if ($f=~ s#[,\s]+mas\s*$##i);
+    push (@n, 'bakk')   if ($f=~ s#[,\s]+bakk\.?\s*$##i);
+#   push (@n, 'llm')    if ($f=~ s#[,\s]+llm\.\s*$##i);
+    push (@n, 'llm')    if ($f=~ s#[,\s]+ll\.?m\.?\s*$##i);
+    push (@n, 'phd2')   if ($f=~ s#,\s*ph\.?d\.\s*##i);
+    push (@n, 'pdoz2')  if ($f=~ s#,\s*privatdoz\.\s*##i);
+  }
+
+  $c->{fixed1}= $f;
+
+  ($nn, $vn)= split(/\s*,\s*/, $f, 2);
+  if (defined ($vn) && $vn ne '')
+  {
+    $c->{nn}= $nn;
+    $c->{vn}= $vn;
+    push (@n, 'csv');
+    $rc= 'split' unless (defined ($rc));
+  }
+  else
+  {
+    my (@vn, @nn);
+    foreach my $n (split(' ', $f))
+    {
+      if ($n =~ m#^[A-Z]\.$# || exists ($first_names{$n})) { push (@vn, $n) } else { push (@nn, $n); }
+    }
+
+    if (!@nn && @vn)
+    { # if there are only firstnames, the last one will be the family name (just guessing)...
+      push (@nn, pop(@vn));
+      push (@n, 'fn-only');
+    }
+
+    $nn= $c->{nn}= join(' ', @nn);
+    $vn= $c->{vn}= join(' ', @vn);
+    push (@n, 'fn-picker');
+
+    if (@vn)
+    {
+      $rc= 'picked' unless (defined ($rc));
+    }
+    else
+    { # no given name found: $rc is not set here, so the caller can report a bad name
+      push (@n, 'no-vn');
+    }
+  }
+
+  $c->{notes1}= join(',', @n);
+
+  if ($fn_init == 1)
+  {
+    print FN_lst join("\t", map { defined ($c->{$_}) ? $c->{$_} : '' } @fn_lst), "\n";
+    $fn_cnt++;
+  }
+
+  return ($rc, $nn, $vn);
 }
 
 sub get_names_for_role
@@ -3739,15 +4007,30 @@ sub get_names_for_role
   my @names= split (/\s*;\s*/, $names);
   foreach my $name (@names)
   {
-    my ($nn, $vn)= split (/\s*,\s*/, $name);
+    $name=~ tr/\x{2010}\x{2011}\x{2012}\x{2013}\x{2014}\x{2015}/-/s; # hyphen, non-breaking hyphen, figure dash, en-dash, em-dash, horizontal bar
+    $name=~ s/ +/ /g;
+    my ($nn, $vn)= split (/\s*,\s*/, $name, 2);
+    $nn= '' unless (defined ($nn)); # empty name
+    $vn= '' unless (defined ($vn)); # name without a comma
     $nn =~ s/^\s*//;
     $vn =~ s/\s*$//;
     print __LINE__, " column_name=[$column_name] name=[$name] nn=[$nn] vn=[$vn]\n";
 
-    if ($vn eq '' || !($vn =~ m#^\U\E[\w\-\x{2010}\. ]+?$#) || !($nn =~ m#^\U\E[\w\-\x{2010} ]+$#))
+    if ($vn eq '' || !($vn =~ m#^\U\E[\w\-\'\.<>() ]+$#) || !($nn =~ m#^\U\E[\w\-\'\. ]+$#))
     { # TODO: add option to flag this as a warning instead of as an error
-      push (@errors, { error => 'bad_name', column_name => $column_name, name => $name } );
-      push (@result, { family_name => $name }); # fill everything in into family_name
+
+      my ($rc, $nn1, $vn1)= fixup_name({ name => $name, ac_number => $row->{ac_nummer}, eprint_id => $row->{eprintid}, column_name => $column_name });
+      print __LINE__, " fixup_name: rc=[", (defined ($rc) ? $rc : ''), "] nn1=[$nn1] vn1=[$vn1]\n";
+
+      if (defined ($rc) && ($rc eq 'special' || $rc eq 'split' || $rc eq 'picked'))
+      {
+        push (@result, { family_name => $nn1, given_name => $vn1 });
+      }
+      else
+      {
+        push (@errors, { error => 'bad_name', column_name => $column_name, name => $name } );
+        push (@result, { family_name => $name }); # fill everything in into family_name
+      }
     }
     else
     {
@@ -4096,6 +4379,8 @@ sub update_policies
 
   my %upd_eprint_ids= map { $_ => 1 } @upd_eprint_ids;
 
+  %doc_embargo_dates= (); # global variable, clear hash from data from previous runs
+
   my $epr= get_eprints_db($cnf);
   # print "epr: ", Dumper ($epr);
 
@@ -4227,6 +4512,8 @@ sub policies_stats
 {
   my $msg= shift;
 
+  %doc_embargo_dates= (); # global variable, clear hash from data from previous runs
+
   $db_ot2ut= IRMA::db::get_any_db($cnf, 'ot2ut_database') unless (defined ($db_ot2ut));
 
   # prepare: get info from sync database
@@ -4583,7 +4870,7 @@ sub policies_stats
 <html>
 <head>
 <meta charset="UTF-8" />
-<meta refresh="600" />
+<meta http-equiv="refresh" content="300" />
 <title>othes to utheses migration statistics</title>
 <style>
 td { text-align:right; }
@@ -4755,7 +5042,7 @@ EOX
     {
       my $c= $totals{$context}->{cnt_ok};
       my $e= $totals{$context}->{cnt_error};
-      my $pct= $c*100.0/$total_othes;
+      my $pct= ($total_othes == 0) ? 0 : $c*100.0/$total_othes;
       my $ck1= ($pct == 100.0) ? 'lightgreen' : 'lightblue';
       my $ck2= ($e == 0) ? 'lightgreen' : 'lightpink';
       printf IDX ("  <td bgcolor=\"$ck1\">%d</td><td bgcolor=\"$ck1\">%5.2f %%</td><td bgcolor=\"$ck2\">%d</td>\n", $c, $pct, $e);
@@ -4776,6 +5063,7 @@ print IDX <<"EOX";
 </body>
 </html>
 EOX
+  close(IDX) or warn "close IDX failed: $!\n"; # buffered write errors only show up here
 
   # show objects which were uploaded to utheses but are no longer present in othes
   my @synced_not_found= sort { $a <=> $b } keys %synced;