
Zhris
Enthusiast
Jan 29, 2018, 8:36 AM
Post #2 of 6
(1958 views)
|
Re: [dilbert] a little script that makes use of LWP::Simple
[In reply to]
|
Can't Post
|
|
Hi,
the first step: first i do a view on the page source to find HTML elements? view-source is a browser-based command: it tells the browser to output the response as plain text rather than render it according to its actual content type (HTML in this case). You should not need to include view-source in your URL. I have written a little script that extracts the data out of each block and cleans it up a little. The browse function is generic: it takes an input ref containing the URL and the XPaths of the parent and its children, and uses them to construct the output ref. It is only meant to give you an idea of an approach I might take; it does not yet navigate across the pages, but you may want to use it as a basis.
use strict;
use warnings FATAL => qw(all);
use LWP::UserAgent;
use HTML::TreeBuilder::XPath;
use Data::Dumper;

# NOTE: the s///r modifier and the // operator used below need perl 5.14+.

# --- Value handlers ---------------------------------------------------------
# Each handler takes the current value and returns the transformed value.
# Handlers listed for a field in $conf are applied left to right.

# Prefix a site-relative href with the site origin.
my $handler_relurl = sub { 'https://europa.eu' . $_[0] };

# Strip leading and trailing whitespace. Anchoring on \A / \z (instead of
# capturing with .+?) also works for values that contain line breaks and
# turns a whitespace-only value into the empty string.
my $handler_trim = sub { $_[0] =~ s{\A\s+|\s+\z}{}gr };

# Drop a leading "Label: " prefix, keeping only what follows the first colon.
my $handler_val = sub { $_[0] =~ s{^[^:]+:\s*}{}r };

# Split a string on the given pattern and return the pieces as an array ref.
my $handler_split       = sub { [ split $_[0], $_[1] ] };
my $handler_split_colon = sub { $handler_split->( qr{; }, $_[0] ) };
my $handler_split_comma = sub { $handler_split->( qr{, }, $_[0] ) };

# --- Scrape configuration ---------------------------------------------------
# url      : page to fetch
# parent   : XPath selecting one node per record
# children : field name => [ XPath relative to the parent, [ handlers ] ]
#            (the handler list is optional)
my $conf = {
    url      => 'https://europa.eu/youth/volunteering/evs-organisation_en',
    parent   => q{//div[@class="vp ey_block block-is-flex"]},
    children => {
        internal_url => [ q{//a/@href}, [ $handler_relurl ] ],
        external_url => [ q{//i[@class="fa fa-external-link fa-lg"]/parent::p//a/@href}, [ $handler_trim ] ],
        title        => [ q{//h4} ],
        topics       => [ q{//div[@class="org_cord"]}, [ $handler_val, $handler_split_colon ] ],
        location     => [ q{//i[@class="fa fa-location-arrow fa-lg"]/parent::p}, [ $handler_trim ] ],
        hand         => [ q{//i[@class="fa fa-hand-o-right fa-lg"]/parent::p}, [ $handler_trim, $handler_split_comma ] ],
        pic_number   => [ q{//p[contains(.,'PIC no')]}, [ $handler_val ] ],
    },
};

print Dumper browse( $conf );

# browse( $conf )
#
# Fetches $conf->{url}, selects every node matching $conf->{parent} and builds
# one hash ref per node from the $conf->{children} field definitions.
#
# Returns an array ref of hash refs. A field whose XPath matches nothing under
# a given parent node is left out of that node's hash.
# Dies with the HTTP status line if the request is not successful.
sub browse {
    my $conf = shift;

    my $lwp_useragent = LWP::UserAgent->new( agent => 'IE 6', timeout => 10 );
    my $response      = $lwp_useragent->get( $conf->{url} );
    die $response->status_line unless $response->is_success;

    my $html_treebuilder_xpath
        = HTML::TreeBuilder::XPath->new_from_content( $response->decoded_content );

    my $children = $conf->{children};
    my $ref      = [];

    for my $node ( $html_treebuilder_xpath->findnodes( $conf->{parent} ) ) {
        my %record;

        for my $key ( sort keys %$children ) {
            my ( $xpath, $handlers ) = @{ $children->{$key} };

            # The leading "." makes the configured "//..." path relative to
            # the current parent node; only the first match is used.
            my ($val) = $node->findvalues(".$xpath");
            next unless defined $val;

            $val = $_->($val) for @{ $handlers // [] };
            $record{$key} = $val;
        }

        push @$ref, \%record;
    }

    # Every extracted value is a plain string by now, so the tree can be
    # released explicitly (HTML::TreeBuilder recommends delete when done).
    $html_treebuilder_xpath->delete;

    return $ref;
}

Output of the first block:
{ 'internal_url' => 'https://europa.eu/youth/volunteering/organisation/948417016_en', 'external_url' => 'http://www.apd.ge', 'location' => 'Tbilisi, Georgia', 'title' => '"Academy for Peace and Development" Union', 'topics' => [ 'Access for disadvantaged', 'Youth (Participation, Youth Work, Youth Policy)', 'Intercultural/intergenerational education and (lifelong)learning' ], 'pic_number' => '948417016', 'hand' => [ 'Receiving', 'Sending' ] } Chris
|