From: Michael Howe Date: Tue, 8 Apr 2014 22:38:38 +0000 (+0100) Subject: Whole bunch of tidying, including: X-Git-Url: https://git.michaelhowe.org/gitweb/?a=commitdiff_plain;h=33a8b94c74dfa6423be47d5b7a89e0fb79295bdf;p=pub%2Fmichael%2Fnational-rail-ticket-split.git Whole bunch of tidying, including: * rename to the more sensible split-route.pl * documentation * debug option - by default it's quiet --- diff --git a/parse-route.pl b/parse-route.pl deleted file mode 100755 index 63ebd06..0000000 --- a/parse-route.pl +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; - -use WWW::Mechanize; -use HTML::TreeBuilder; - -use Data::Dumper; -use List::Util qw(sum); -use Graph; - -use 5.14.0; - -my $mech = WWW::Mechanize->new(); - -my $query_site = "http://traintimes.org.uk"; - -die "Usage: $0 \n" - unless( scalar( @ARGV ) == 4 ); - -my ( $src_in, $dest_in, $date, $depart ) = @ARGV; - -say "Finding routes from $src_in to $dest_in on $date at $depart"; - -my $url = "${query_site}/${src_in}/${dest_in}/${depart}/${date}"; -say "Checking $url"; - -# This is the overall route -my $route = get_stops( $url ); - -print Dumper( $route ); - -# Give me some more useful variables. Note that we need the array because we need the order. -my @all_stops = map { $_->{abbr} } @{$route->{stops}}; -my %stop_details = map { $_->{abbr} => { departs => $_->{departs}, arrives => $_->{arrives}, point => $_->{point} } } @{$route->{stops}}; - -print Dumper( \%stop_details ); -my $graph = Graph->new(); -my %journeys; - -$graph->add_weighted_edge( $all_stops[0], $all_stops[-1], $route->{price} ); -get_substops( \@all_stops ); - -print Dumper( \%journeys ); - -my @best_route = $graph->SP_Dijkstra( $all_stops[0], $all_stops[-1] ); - -say "Best route:"; -my $sum = 0; -for(my $i = 0; $i < $#best_route; $i++ ){ - my $journey = $journeys{$best_route[$i]}->{$best_route[$i+1]}; - printf "%s -> %s: %s (%s)\n", $best_route[$i], $best_route[$i+1], $journey->{price}, $journey->{type}; - $sum += $journey->{price}; -} -say "Total price: $sum"; -say "Direct price: " . $route->{price}; - -## -## functions below here -sub get_stops { - my ( $url ) = @_; - say "get_stops( $url )"; - - $mech->get( $url ); - - my $tree = HTML::TreeBuilder->new_from_content( $mech->content ); - - my $first_stop = $tree->look_down( '_tag', 'li', 'id', 'result0' ); - my $details_url = $first_stop->look_down( '_tag', 'a', 'class', 'calling_link' )->attr('href'); - my $ticket_type = $first_stop->look_down( '_tag', 'span', 'class', 'fare-type tooltip')->look_down('_tag', 'a')->as_text; - my $ticket_price = ( $first_stop->look_down( '_tag', 'label' ) )[0]->as_text; - - $tree = $tree->delete(); - - # tidy up - $ticket_price =~ s{[^0-9.]}{}g; - $ticket_type =~ s{^\s+}{}; - - say "Ticket type: $ticket_type"; - say "Cost: $ticket_price"; - - my $stops = _parse_stopping_points( "$query_site/$details_url" ); - my %return = ( - stops => $stops, - type => $ticket_type, - price => $ticket_price, - ); - return \%return; -} - -sub _parse_stopping_points { - my ( $url ) = @_; - - my @stops; - - my $mech = WWW::Mechanize->new(); - - $mech->get( $url ); - - my $tree = HTML::TreeBuilder->new_from_content( $mech->content ); - - # intermediate stops - my $calling_points_div = $tree->look_down( '_tag', 'div', 'class', 'calling' ); - my @rows = $calling_points_div->look_down( '_tag', 'tr' ); - foreach my $row ( @rows ){ - my $point_ent = $row->look_down('_tag', 'td', 'class', 'calling-points'); - next unless( $point_ent ); - my ( $point, $abbr ) = $point_ent->as_text() =~ m{(.*)\s\[([A-Z]+)\]}; - my $arrives = $row->look_down( '_tag', 'td', 'class', 'arrives' )->as_text(); - my $departs = $row->look_down( '_tag', 'td', 'class', 'departs' )->as_text(); - say "$point ($abbr): Arrives: $arrives; departs $departs"; - my %res = ( - arrives => $arrives, - departs => $departs, - point => $point, - abbr => $abbr, - ); - push @stops, \%res; - } - - # first and last stops - my $srcdest = $tree->look_down( '_tag', 'div', 'id', 'content' )->look_down('_tag', 'h2')->as_text; - my ( $departs, $arrives ) = $tree->look_down( '_tag', 'div', 'id', 'content' )->look_down('_tag', 'p')->as_text() =~ m{Leaving\sat\s(\d+:\d+),\sarriving\sat\s(\d+:\d+)}x; - - my ( $src_station, $src_abbr, $dest_station, $dest_abbr ) = - $srcdest =~ m{\s*(.+)\s+\[([A-Z]+)\]\s*to\s*(.+)\s+\[([A-Z]+)\]}; - say "Source: $src_station, $src_abbr at $departs"; - say "Dest: $dest_station, $dest_abbr at $arrives"; - push @stops, { arrives => $arrives, point => $dest_station, abbr => $dest_abbr }; - unshift @stops, { departs => $departs, point => $src_station, abbr => $src_abbr }; - - $tree = $tree->delete(); - return \@stops; -} - -sub get_substops { - my ( $stop_list ) = @_; - - for( my $i = 0; $i <= $#{$stop_list}; $i++ ){ - for( my $j = $i + 1; $j <= $#{$stop_list}; $j++ ){ - # skip src-dest as we've already calculated that - next if( 0 == $i and $#{$stop_list} == $j ); - my $src = $stop_list->[$i]; - my $dst = $stop_list->[$j]; - - say "checking $src -> $dst ($i -> $j)"; - - my $depart = $stop_details{$src}->{departs}; - my $url = "${query_site}/${src}/${dst}/${depart}/${date}"; - my $journey = get_stops( $url ); - - $graph->add_weighted_edge($src, $dst, $journey->{price} ); - $journeys{$src}->{$dst} = $journey; - } - } -} diff --git a/split-route.pl b/split-route.pl new file mode 100755 index 0000000..3000c28 --- /dev/null +++ b/split-route.pl @@ -0,0 +1,271 @@ +#!/usr/bin/perl +use strict; +use warnings; +use 5.10.0; # pull in say + +=head1 NAME + +split-route.pl - parse a train route and attempt to find split tickets + +=head1 SYNOPSIS + +split-route.pl [--debug] [--help|--man] SRC DEST DATE TIME + +=head1 DESCRIPTION + +Given two stations, SRC and DEST, and a train travelling between them on DATE +at TIME, this script will scrape L to find all the +possible intermediate stations at which the journey could be broken, and then +calculate the price of making all those journeys. + +=head1 CAVEATS + +This script assumes that the first train returned by a search for the given +date and time between the given stations will be direct. Bad things will +happen if it is not (also, split tickets may do odd things). + +=head1 STATION IDENTIFIERS + +Either station names (eg 'Oxford', 'Birmingham New Street') or 3-letter codes +(eg 'OXF', 'BHM') can be used. + +=head1 OPTIONS + +=over 4 + +=item --debug + +Turns on debugging; will print details about each route. + +=item --help + +Gives a short help + +=item --man + +Shows this manpage. + +=back + +=head1 EXAMPLE USAGE + + split-route.pl OXF BHM 2014-05-16 17:00 + +=cut + +use WWW::Mechanize; +use HTML::TreeBuilder; + +use Data::Dumper; +use List::Util qw(sum); +use Graph; +use Getopt::Long; +use Pod::Usage; + +my $debug = 0; +my ( $help, $man ); + +GetOptions( "debug" => \$debug, + "help" => \$help, + "man" => \$man, + ); + +pod2usage(1) if( $help ); +pod2usage( -verbose => 2 ) if( $man ); + +unless( scalar( @ARGV ) == 4 ){ + pod2usage( "Invalid number of arguments passed (use --help for more details)" ); +} + +my ( $src_in, $dest_in, $date, $depart ) = @ARGV; + +my $mech = WWW::Mechanize->new(); + +my $query_site = "http://traintimes.org.uk"; + +say "Finding routes from $src_in to $dest_in on $date at $depart"; + +my $url = "${query_site}/${src_in}/${dest_in}/${depart}/${date}"; + +# Hash to hold all journeys, of the form: +# SRCSHORTCODE => { DESTSHORTCODE1 => { stops => [], price => 'price', type => 'type' }, +# DESTSHORTCODE2 => { stops => [], price => 'price', type => 'type }, +# }, +my %journeys; + + +# This is the overall route +my $route = get_stops( $url ); + +# Add the overall route to the journeys list, otherwise if it's cheapest the +# summary doesn't have its details. +$journeys{$route->{stops}[0]->{abbr}}->{$route->{stops}[-1]->{abbr}} = $route; + +debug( "Route:", Dumper( $route ) ); + +# Give us some more useful variables. Note that we need the array because we +# need the order. +# all_stops is simply a list of stops, in order +my @all_stops = map { $_->{abbr} } @{$route->{stops}}; +# stop_details is a hash of: +# SHORTCODE => { point => 'station name', +# arrives => 'arrival_time', +# departs => 'departure_time', +# }, +# to make looking things up by shortcode so much easier. +# Note that this is based on the original route. +my %stop_details = map { $_->{abbr} => { departs => $_->{departs}, arrives => $_->{arrives}, point => $_->{point} } } @{$route->{stops}}; + +debug( "Stop details:", Dumper( \%stop_details ) ); + +my $graph = Graph->new(); + +# Add the source-dest journey +$graph->add_weighted_edge( $all_stops[0], $all_stops[-1], $route->{price} ); + +# Find all suitable combinations of substops +# This populates the graph nodes +get_substops( \@all_stops ); + +debug( "Journeys:", Dumper( \%journeys ) ); + +# Find the single best route +my @best_route = $graph->SP_Dijkstra( $all_stops[0], $all_stops[-1] ); + +say "Cheapest option:"; +my $sum = 0; +if ( 2 == @best_route ){ + say "[direct price]"; +} +for(my $i = 0; $i < $#best_route; $i++ ){ + my $journey = $journeys{$best_route[$i]}->{$best_route[$i+1]}; + my $source_station = $stop_details{$best_route[$i]}->{point}; + my $dest_station = $stop_details{$best_route[$i+1]}->{point}; + my $departs = $journey->{stops}->[0]->{departs}; + my $arrives = $journey->{stops}->[-1]->{arrives}; + printf "%s (%s) -> %s (%s): %s -> %s: £%s (%s)\n", $source_station, $best_route[$i], $dest_station, $best_route[$i+1], $departs, $arrives, $journey->{price}, $journey->{type}; + $sum += $journey->{price}; +} +printf "Total price: £%.2f\n", $sum; +printf "Direct price: £%.2f\n", $route->{price}; + +## +## functions below here +# Get ticket price, type and stops from a URL +sub get_stops { + my ( $url ) = @_; + debug( "get_stops( $url )" ); + + $mech->get( $url ); + + my $tree = HTML::TreeBuilder->new_from_content( $mech->content ); + + my $first_stop = $tree->look_down( '_tag', 'li', 'id', 'result0' ); + my $details_url = $first_stop->look_down( '_tag', 'a', 'class', 'calling_link' )->attr('href'); + my $ticket_type = $first_stop->look_down( '_tag', 'span', 'class', 'fare-type tooltip')->look_down('_tag', 'a')->as_text; + my $ticket_price = ( $first_stop->look_down( '_tag', 'label' ) )[0]->as_text; + + $tree = $tree->delete(); + + # tidy up + $ticket_price =~ s{[^0-9.]}{}g; + $ticket_type =~ s{^\s+}{}; + + debug( "Ticket type: $ticket_type" ); + debug( "Cost: $ticket_price" ); + + my $stops = _parse_stopping_points( "$query_site/$details_url" ); + my %return = ( + stops => $stops, + type => $ticket_type, + price => $ticket_price, + ); + return \%return; +} + +# Get stops from a given URL +sub _parse_stopping_points { + my ( $url ) = @_; + + my @stops; + + my $mech = WWW::Mechanize->new(); + + $mech->get( $url ); + + my $tree = HTML::TreeBuilder->new_from_content( $mech->content ); + + # intermediate stops + my $calling_points_div = $tree->look_down( '_tag', 'div', 'class', 'calling' ); + my @rows = $calling_points_div->look_down( '_tag', 'tr' ); + foreach my $row ( @rows ){ + my $point_ent = $row->look_down('_tag', 'td', 'class', 'calling-points'); + next unless( $point_ent ); + my ( $point, $abbr ) = $point_ent->as_text() =~ m{(.*)\s\[([A-Z]+)\]}; + my $arrives = $row->look_down( '_tag', 'td', 'class', 'arrives' )->as_text(); + my $departs = $row->look_down( '_tag', 'td', 'class', 'departs' )->as_text(); + debug( "$point ($abbr): Arrives: $arrives; departs $departs" ); + my %res = ( + arrives => $arrives, + departs => $departs, + point => $point, + abbr => $abbr, + ); + push @stops, \%res; + } + + # first and last stops + my $srcdest = $tree->look_down( '_tag', 'div', 'id', 'content' )->look_down('_tag', 'h2')->as_text; + my ( $departs, $arrives ) = $tree->look_down( '_tag', 'div', 'id', 'content' )->look_down('_tag', 'p')->as_text() =~ m{Leaving\sat\s(\d+:\d+),\sarriving\sat\s(\d+:\d+)}x; + + my ( $src_station, $src_abbr, $dest_station, $dest_abbr ) = + $srcdest =~ m{\s*(.+)\s+\[([A-Z]+)\]\s*to\s*(.+)\s+\[([A-Z]+)\]}; + debug( "Source: $src_station, $src_abbr at $departs" ); + debug( "Dest: $dest_station, $dest_abbr at $arrives" ); + push @stops, { arrives => $arrives, point => $dest_station, abbr => $dest_abbr }; + unshift @stops, { departs => $departs, point => $src_station, abbr => $src_abbr }; + + $tree = $tree->delete(); + return \@stops; +} + +# Walk a list of stops and work out their cost, then populate the graph +sub get_substops { + my ( $stop_list ) = @_; + + for( my $i = 0; $i <= $#{$stop_list}; $i++ ){ + for( my $j = $i + 1; $j <= $#{$stop_list}; $j++ ){ + # skip src-dest as we've already calculated that + next if( 0 == $i and $#{$stop_list} == $j ); + my $src = $stop_list->[$i]; + my $dst = $stop_list->[$j]; + + debug( "checking $src -> $dst ($i -> $j)" ); + + my $depart = $stop_details{$src}->{departs}; + my $url = "${query_site}/${src}/${dst}/${depart}/${date}"; + my $journey = get_stops( $url ); + # Sanity check departure and arrival times match up with the source journey + my $expected_depart = $stop_details{$src}->{departs}; + my $actual_depart = $journey->{stops}[0]{departs}; + unless( $expected_depart eq $actual_depart ){ + die "Error: train $src -> $dst does not depart at the expected time! (Expected $expected_depart, actual $actual_depart)"; + } + my $expected_arrive = $stop_details{$dst}->{arrives}; + my $actual_arrive = $journey->{stops}[-1]{arrives}; + unless( $expected_arrive eq $actual_arrive ){ + die "Error: train $src -> $dst does not arrive at the expected time! (Expected $expected_arrive, actual $actual_arrive)"; + } + + $graph->add_weighted_edge($src, $dst, $journey->{price} ); + $journeys{$src}->{$dst} = $journey; + } + } +} + +# Simple debugging function +sub debug { + if( $debug ){ + print join( "\n", @_ ); + } +}