
chetwyn
Novice
Apr 26, 2006, 6:10 AM
Post #4 of 6
(1202 views)
|
|
Re: [davorg] Pattern matching issue with HTML file, please help...
[In reply to]
|
Can't Post
|
|
OK. Here's what I've done... It works pretty well. I'm not sure how to format in this forum but here it is:

#!/usr/bin/perl
#
# HTML Formatter
#
# 1. Re-flows and re-indents every "*.htm*" file found in the current
#    directory, writing "<name>.out" (or overwriting the input with -r).
# 2. Scans every "*.out*" file for lines carrying class=listingName,
#    class=gold or class=free and writes the captured text to Results.txt.
# 3. Moves the ".out" files into ./processed and the ".htm" files into
#    ./html_files.
#
# NOTE(review): at least one filespec argument is required, but the arguments
# themselves are not used to select files -- the current directory is always
# scanned. That is how the original script behaved, so it is kept as is.

use strict;
use warnings;

use File::Copy qw(move);
use File::Spec;
use Getopt::Long;

# Script name without its directory part (either slash style).
(my $progname = $0) =~ s{.*[/\\]}{};

my $credit = "HTML Formatter\n";

# Tags that are forced onto a line of their own.
my $tags = join '|', qw(
    <html </html <body </body <head </head <title
    <isindex <link <meta <!doctype
    <table <tr <th <td </th </tr </table <caption
    <thead </thead <tbody </tbody
    <p <br <blockquote <hr <center </center <div </div
    <col <colgroup
    <marquee </marquee
    <style </style
    <h1 <h2 <h3 <h4 <h5 <h6
    <ul </ul <ol </ol <dl </dl <li <dt <dd <dir </dir <menu </menu
    <map <area </map
    <base <basefont <bgsound
    <object <applet <param </object </applet <embed </embed
    <frameset <frame <noframes </noframes </frame </frameset
    <form </form <input <select <option </select <textarea
);

# Tags that increase / decrease the indentation depth.
my $tagindent = join '|', qw(
    <table <tr <td
    <select <form
    <frameset
    <ul <ol <dl <dir <menu <map
);
my $tagunindent = join '|', qw(
    </table </tr </td
    </select </form
    </frameset
    </ul </ol </dl </dir </menu </map
);

my $tags_re     = qr{($tags)}i;
my $indent_re   = qr{(?:$tagindent)}i;
my $unindent_re = qr{(?:$tagunindent)}i;

# The original called NGetOpt() without loading newgetopt.pl and never
# declared 'c', so -c could not be switched on. Getopt::Long is the core
# replacement and accepts the same option specs.
my %opt;
GetOptions(\%opt, 'n:i', 'r', 'l', 'u', 'c', 'q', 'h', 't', 'todo')
    or do { usage(); exit 1; };

if ($opt{todo}) {
    print <<"EOT";
$credit
To do list:

Everything's done? Everything's DONE! So bring it on
EOT
    exit;
}

if (!@ARGV || $opt{h}) {
    usage();
    exit(0);
}

my $amount  = (defined $opt{n} && $opt{n} >= 0) ? $opt{n} : 0;
my $usetabs = defined $opt{t} ? 1 : 0;
my $replace = $opt{r} ? 1 : 0;
my $lower   = $opt{l} ? 1 : 0;
my $upper   = $opt{u} ? 1 : 0;
my $convert = $opt{c} ? 1 : 0;
my $quiet   = $opt{q} ? 1 : 0;

print "$credit\n\nProcessing...\n" unless $quiet;

# Stage 1: format every HTML file in the current directory.
for my $entry (current_dir_entries()) {
    next unless $entry =~ /\.htm/i && -f $entry;
    my $fileout = $replace ? $entry : "$entry.out";
    format_file($entry, $fileout);
    print "$entry\n" unless $quiet;
}

# Stage 2: pull the listing names and addresses out of the formatted files.
open my $results, '>', 'Results.txt' or die "can not open result file\n";
for my $entry (current_dir_entries()) {
    next unless $entry =~ /\.out/i && -f $entry;
    extract_listings($results, $entry);
}
close $results or die "can not write result file: $!\n";

# Stage 3: tidy the working directory.
cleaner();

print "\n\nExtraction Complete\n\n";
exit;

# Prints the "+"/"-" chdir marker the original emitted before each directory
# scan and returns the entries of the current directory (unsorted).
sub current_dir_entries {
    print(chdir('.') ? "+\n" : "-\n");
    opendir my $dh, '.' or die "can not read current directory: $!\n";
    my @entries = readdir $dh;
    closedir $dh;
    return @entries;
}

# Splits every chunk in @_ on newlines and returns the resulting lines.
# A chunk that is exactly "\n" becomes a single space, which preserves blank
# lines in script and comments.
sub splitlines {
    my @pieces;
    for my $chunk (@_) {
        push @pieces, $chunk eq "\n" ? ' ' : split(/\n/, $chunk);
    }
    return @pieces;
}

# Returns a closure that is fed the lines of one pass in order and reports
# whether the current line must be left untouched because it is inside
# <script>/<% %>, a comment, or <pre>.
#
# A closing marker only takes effect on the line AFTER the one that contains
# it, so the closing line itself is still treated as verbatim.
#
# The matches deliberately do not use /g: in scalar context /g resumes at the
# previous match's pos(), which made a line such as "//--></script>" clear the
# script flag but never the comment flag.
sub make_verbatim_tracker {
    my ($in_script, $in_comment, $in_pre) = (0, 0, 0);
    my $previous = '';
    return sub {
        my ($line) = @_;
        $in_script  = 0 if $previous =~ m{</script|%>}i;
        $in_comment = 0 if $previous =~ m{-->|</comment}i;
        $in_pre     = 0 if $previous =~ m{</pre>}i;
        $in_script  = 1 if $line =~ m{<script|<%}i;
        $in_comment = 1 if $line =~ m{<!--|<comment}i;
        $in_pre     = 1 if $line =~ m{<pre}i;
        $previous = $line;
        return ($in_script || $in_comment || $in_pre) ? 1 : 0;
    };
}

# Pass 1: joins all ordinary markup into one long string (tabs removed,
# "< " and " >" tightened) while keeping verbatim lines on lines of their own.
sub join_markup {
    my @lines       = @_;
    my $is_verbatim = make_verbatim_tracker();
    my $joined      = '';
    for my $line (@lines) {
        if ($is_verbatim->($line)) {
            $joined .= "\n$line\n";
            next;
        }
        $line =~ s/\t//g;
        $line =~ s/< /</g;
        $line =~ s/ >/>/g;
        $joined .= $line =~ />$/ ? $line : "$line ";
    }
    return splitlines($joined);
}

# Pass 2: squeezes runs of spaces, starts a new line before every known tag
# and applies the -c / -u / -l conversions.
sub break_before_tags {
    my @lines       = @_;
    my $is_verbatim = make_verbatim_tracker();

    # Tag/quote state is carried from line to line but starts fresh for each
    # file, so an unbalanced quote cannot leak into the next file.
    my ($in_tag, $in_quote) = (0, 0);

    my @out;
    for my $line (@lines) {
        if (!$is_verbatim->($line)) {
            $line =~ s/ {2,}/ /g;
            $line =~ s/$tags_re/\n$1/g;
            if ($convert) {
                $line =~ s{\(c\)}{&copy;}gi;
                $line =~ s{\(r\)}{&reg;}gi;
            }
            if ($upper || $lower || $convert) {
                my $converted = '';
                for my $char (split //, $line) {
                    $in_tag   = 1          if $char eq '<';
                    $in_tag   = 0          if $char eq '>';
                    $in_quote = !$in_quote if $char eq '"';

                    my $piece = $char;
                    if ($in_tag && !$in_quote) {
                        $piece = uc $char if $upper;
                        $piece = lc $char if $lower;
                    }
                    if (!$in_quote) {
                        # Files are read as raw bytes, so these are the
                        # Latin-1 copyright / registered-sign bytes.
                        $piece = '&copy;' if ord($char) == 169;
                        $piece = '&reg;'  if ord($char) == 174;
                    }
                    $converted .= $piece;
                }
                $line = $converted;
            }
        }
        push @out, $line;
    }
    return splitlines(@out);
}

# Pass 3: prefixes each ordinary line with depth * $amount tabs or spaces.
# Closing tags reduce the depth before the line is emitted, opening tags
# increase it afterwards.
sub indent_lines {
    my @lines       = @_;
    my $is_verbatim = make_verbatim_tracker();
    my $unit        = $usetabs ? "\t" : ' ';
    my $depth       = 0;

    my @out;
    for my $line (@lines) {
        my $verbatim = $is_verbatim->($line);
        my $prefix   = '';
        if (!$verbatim) {
            $line =~ s/ $//;
            my $closed = () = $line =~ /$unindent_re/g;
            $depth -= $closed;
            # Unbalanced markup can push the depth below zero.
            $prefix = $unit x ($depth * $amount) if $depth > 0 && $amount > 0;
        }
        push @out, $prefix . $line;
        if (!$verbatim) {
            my $opened = () = $line =~ /$indent_re/g;
            $depth += $opened;
        }
    }
    return splitlines(@out);
}

# Reads $filein, runs the three formatting passes and writes the result to
# $fileout (which may be the same file). Dies if either file cannot be used.
sub format_file {
    my ($filein, $fileout) = @_;

    open my $in, '<', $filein or die "Can't open $filein: $!";
    my @raw = <$in>;
    close $in;

    my @lines = indent_lines(break_before_tags(join_markup(splitlines(@raw))));

    open my $out, '>', $fileout or die "Can't write $fileout: $!";
    print {$out} "$_\n" for @lines;
    close $out or die "Can't write $fileout: $!";
    return;
}

# Scans one formatted file and prints listing names and addresses to the
# already-open $results handle.
sub extract_listings {
    my ($results, $file) = @_;

    open my $in, '<', $file or die "can not open $file: $!\n";
    my @rows = <$in>;
    close $in;

    for my $row (@rows) {
        $row =~ s/\s+//;     # drops the FIRST whitespace run only, as before
        $row =~ s/<L.*//;
        next if $row eq '';

        if ($row =~ /class=listingName/i) {
            my $name;
            if ($row =~ /<A/i) {
                ($name) = $row =~ /">(.*)</i;
            }
            else {
                ($name) = $row =~ />(.*)/;
            }
            $name = '' unless defined $name;
            $name =~ s/&amp;/&/gi;
            print {$results} "\n$name\n";
        }

        if ($row =~ /class=gold/i) {
            my ($address) = $row =~ />(.*)/;
            $address = '' unless defined $address;
            # Lines starting with "ph" are skipped.
            print {$results} "$address\n\n" unless $address =~ /^ph/i;
        }
        elsif ($row =~ /class=free/i) {
            my ($address) = $row =~ />(.*)/;
            $address = '' unless defined $address;
            print {$results} "$address\n\n";
        }
    }
    return;
}

# Moves "*.out*" files into ./processed and "*.htm*" files into ./html_files,
# creating the directories when needed. Uses mkdir/File::Copy instead of the
# Windows-only "md", "copy" and "del" shell commands.
sub cleaner {
    for my $dir (qw(processed html_files)) {
        next if -d $dir;
        mkdir $dir or warn "can not create $dir: $!\n";
    }

    for my $entry (current_dir_entries()) {
        next unless -f $entry;

        my $target;
        if ($entry =~ /\.out/i) {
            $target = 'processed';
        }
        elsif ($entry =~ /\.htm/i) {
            $target = 'html_files';
        }
        next unless defined $target;

        move($entry, File::Spec->catfile($target, $entry))
            or warn "can not move $entry to $target: $!\n";
    }
    return;
}

# Prints the command-line help.
sub usage {
    print <<"EOT";
$credit
Usage: $progname [-options] filespec [filespec...]

filespec is a filename or filename pattern (e.g. *.htm)

  -n     Number of tabs/spaces to indent (default: 0)
  -t     Uses tabs instead of spaces for indenting
  -r     Replace original with new file
  -l     Convert tags to lower-case
  -u     Convert tags to upper-case
  -c     Convert (C) -> &copy; and (R) -> &reg;
  -h     Help
  -todo  To do list
EOT
    return;
}
(This post was edited by chetwyn on Apr 26, 2006, 6:12 AM)
|