diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..40ea0500a93803f1a1ac3449f474a89ac9c9971f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.swp
+@*
diff --git a/scraper/htb.pl b/scraper/htb.pl
new file mode 100755
index 0000000000000000000000000000000000000000..b1649f37405780bd1209169f8e60b90d2e2caf17
--- /dev/null
+++ b/scraper/htb.pl
@@ -0,0 +1,46 @@
+#!/usr/bin/perl
+#
+# htb.pl - exploratory script: parse a small inline HTML document with
+# HTML::TreeBuilder, then dump the whole tree and the list of <p> elements
+# with Data::Dumper to inspect the structures the parser builds.
+
+use strict;
+use warnings;
+
+use FileHandle;
+use HTML::TreeBuilder;
+use Data::Dumper;
+# Indent style 1: plain fixed-width indentation per nesting level.
+$Data::Dumper::Indent= 1;
+
+# The sample text contains a non-ASCII character; emit it as UTF-8.
+binmode STDOUT, ':utf8';
+
+# Sample document. The heredoc is double-quoted, so \x{e4} interpolates to
+# the single character U+00E4.
+my $t1= <<"EOX";
+<html>
+ <head>
+  <title>bla</title>
+ </head>
+ <body>
+  <p ID="par1">Pargraph1</p>
+  <p ID="par2">Pargraph2 Universit\x{e4}t Wien</p>
+  <p ID="par3">Pargraph3</p>
+  <p ID="par4">Pargraph4</p>
+ </body>
+</html>
+EOX
+
+my $htb1= HTML::TreeBuilder->new_from_content($t1);
+
+# Dump the complete parse tree.
+print "htb1: ", Dumper ($htb1);
+
+# In list context look_down returns every element matching the criteria.
+my @x= $htb1->look_down (_tag => 'p');
+print "x: ", Dumper (\@x);
+
+# Release the tree explicitly. Element nodes refer back to their parents, so
+# HTML-Tree versions without weak references need delete() to free the tree.
+$htb1->delete;