#!/usr/bin/perl # # ---------------------------- # -------- FilterHTML -------- # ---------------------------- # # A perl script for removing all html tags from an HTML # document. Feed it an html file and it'll return just the # text without the tags. # It does not deal with tags that cross lines. # This script was written by William Stearns # (wstearns@pobox.com). I haven't decided about the license # yet, but it'll probably be GPL later... # Copyright 1997 by William Stearns. No employer has # any rights to this program over the rights granted by # the GPL as this was developed on personal time. # # \s is [ \r\t\n\f] # \w is [_0-9a-zA-Z] # \{, \} for counter # \- for range # \/ for escapes # # sort order: !"#$%&'()*+,-./0-9:;< a-z{|}~ $line = ""; do { $newline = ; $line .= $newline; $_ = $newline; $lts = tr//>/; # if ($lts != $gts) { # print ($lts . "\t" . $gts); # } } until ( ($lts == $gts) or ($newline eq "") ); while ($line ne "") { # $line =~ s///ig; # Only open and close $line =~ s/<\/?(address|b|banner|blink|blockquote|body|br)>//ig; $line =~ s/<\/?(caption|center|cite|code|colgroup)>//ig; $line =~ s/<\/?(dd|div|dl|dt|em|font|form|frameset)>//ig; $line =~ s/<\/?(head|html|i|menu|nobr|noframe|noframes|ol)>//ig; $line =~ s/<\/?(p|pre|select|sl|small|strong|style|sup)>//ig; $line =~ s/<\/?(table|td|th|title|tr|tt|u|ul)>//ig; # Simple $line =~ s/<(dl compact|hr|ul plain)>//ig; $line =~ s/<(p|div)\salign\s?=\s?"?(left|center|right)"?>//ig; $line =~ s/<\/?h[1-6]>//ig; $line =~ s/