#!/usr/bin/perl # file: razor-caching-proxy.pl # #Copyright 2002 William Stearns #Released under the GPL. use strict; use IO::Socket qw(:DEFAULT :crlf); use IO::Socket; use Socket; use IO::Handle; use IO::File; use Getopt::Long; use POSIX 'WNOHANG'; use POSIX 'setsid'; use constant CACHE_VERSION => '0.4.3'; use constant DEFAULT_CACHE_FILE => '/var/cache/razor/razor-cache'; use constant DEFAULT_LOG_FILE => '/var/log/razor'; use constant DEFAULT_MAX_NEG_AGE => 20 * 60; # = 20 minutes use constant DEFAULT_MAX_POS_AGE => 30 * 24 * 60 * 60; # = 30 days use constant DEFAULT_RAZOR_PORT => 2702; use constant DEFAULT_REAL_SERVER_NAME => 'b.razor.vipul.net.'; use constant MAX_OTHER_AGE => 60 * 60; #Hardcoded 1 hour for non-Positive, non-Negative. #Handle interrupt (Ctrl-C) between client connections my $quit = 0; $SIG{INT} = sub { $quit++ }; #Handle, umm, ignore exiting child events $SIG{CHLD} = sub { while (waitpid(-1,WNOHANG)>0) { } }; #Handle TERM and INT with explicit exit so we can unlink PID file on the way out. $SIG{TERM} = $SIG{INT} = sub { exit 0; }; #Initial responses, we'll load additional responses from the command line specified cache file. #The Positive hash string listed below is from the sample spam included with the razor package. #This guarantees we'll always have at least one live spam signature to test against. my %responses = ( "cache_version" => "Razor Caching Server live, version " . CACHE_VERSION . ".", "key:d97b1764b257f4f67a7d7fc9e6988aace12c5a56&action:lookup" => "Positive d97b1764b257f4f67a7d7fc9e6988aace12c5a56" ); my %timestamps = ( "cache_version" => 2000000000, "key:d97b1764b257f4f67a7d7fc9e6988aace12c5a56&action:lookup" => 1022560800 ); #2000000000 is in May, 2033 :-), 1022560800 = Tue May 28, 2002 00:40 EDT #Command line options: my $cache_file = DEFAULT_CACHE_FILE; my $listen_port = DEFAULT_RAZOR_PORT; my $log_file = ''; my $foreground = 0; my $max_neg_age = DEFAULT_MAX_NEG_AGE; my $max_pos_age = DEFAULT_MAX_POS_AGE; my $real_server_name = DEFAULT_REAL_SERVER_NAME; my $real_server_port = DEFAULT_RAZOR_PORT; my $help = 0; my $verbosity = 0; #Other variables: my $missing_cache_warned = 0; my $parent_pid; my $log_fh; my $field_separator = '/'; #Don't use ':', '&', or alphanumeric as these are used in the actual queries and responses. #'/' and ',' seem to work fine, '|' does not. #Make sure you edit any cache files to use this new separator if you change this here. my $pid_file; if ($< == 0) { #pick an appropriate directory for the pid file $pid_file = '/var/run/razor-caching-proxy.pid'; #based on real UID is root or not. Allows non-root } else { #users to still run this. Yes, two copies could be $pid_file = '/var/tmp/razor-caching-proxy.pid'; #running at the same time, but not on the same port. } #Subroutines sub become_daemon { die "Can't fork" unless defined (my $child = fork); exit 0 if $child; #Parent exits setsid(); #Become session leader open(STDIN, "/dev/null"); open(STDERR, ">&STDOUT"); chdir '/'; umask(0); #forget file mode creation mask $ENV{PATH} = '/bin:/sbin:/usr/bin:/usr/sbin'; return $$; } #End sub become_daemon sub Debug { my $DebugLevel = shift; if ($verbosity >= $DebugLevel) { my $DebugString = shift; print $log_fh "$DebugString"; } } #End sub Debug sub open_pid_file { my $file = shift; if (-e $file) { #file already exists my $fh = IO::File->new($file) || return; my $pid = <$fh>; die "Server already running with PID $pid" if kill 0 => $pid; warn "Removing PID file for defunct server process $pid.\n"; die "Can't unlink PID file $file" unless -w $file && unlink $file; } my $pid_fh = IO::File->new($file, O_WRONLY|O_CREAT|O_EXCL, 0644) or die "Can't create $file: $!\n"; } #End sub open_pid_file sub saveline { #Example call: # saveline "$cache_file", "$timestamp$field_separator$recreated_query$field_separator$response"; my $cache_filename = shift; my $line_to_add = shift; #Save in cache file so we can load it in on future invocations. #Note that we save all entries, regardless of age. We also _load_ #all entries, regardles of age. The only check for whether an #entry should be retired is right after we get a query from a client; #too-old entries get deleted from the %timestamps and %reponses #hashes. That way we check at most 20 entries for age. At load time, #we'd have to check thousands to hundreds of thousands. if (my $cache_fh = IO::File->new("$cache_filename", O_WRONLY|O_APPEND)) { # |O_CREAT not used, security risk $cache_fh->autoflush(1); print $cache_fh "$line_to_add\n"; close $cache_fh; Debug 3, "stored \"$line_to_add\" in cache file: $cache_file.\n"; } elsif ($missing_cache_warned == 0) { #Warn once about missing or unwritable cache file. Debug 0, "Local cache file $cache_file unavailable or unwritable for storing new entries (create it if it's not there and check permissions, please): $!.\n"; $missing_cache_warned++; } } #End sub saveline my $USAGEMSG = < File that holds cached queries and responses ($cache_file) * --listen_port Port this proxy should listen on for incoming connections. ($listen_port) --log_file Filename to log debug messages to (STDERR). /var/log/razor, perhaps. * --foreground Run proxy in the current terminal instead of the default of running as a daemon --max_neg_age Maximum age of a Negative cache entry before it is discarded. ($max_neg_age) --max_pos_age Maximum age of a Positive cache entry before it is discarded. ($max_pos_age) --server_name Real server this proxy consults for uncached queries ($real_server_name) --server_name also legal (this :port overrides any --server_port) --server_port Port on which that server listens ($real_server_port) --help This help message --verbose Send debug output to stdout, use twice to see network details, three times to show all loads and stores too. * For security reasons, these files must be created before starting the proxy or they will not be used at all. If you would not like to cache Negative responses (the old --ignore_negative option), set --max_neg_age to some small number of seconds, say 60. To run independantly, i.e. without any parent server to consult, add --server_name '' Examples: To test the proxy (enable the echo service on localhost first): razor-caching-proxy.pl --server_n=localhost:7 --ca=/tmp/razor-cache -v Specify an alternate real server and listen for incoming connections on 8000: razor-caching-proxy.pl --server_n=a.razor.vipul.net. --listen_port 8000 USAGE #Load command line params. die "$USAGEMSG" unless GetOptions( 'cache_file=s' => \$cache_file, 'foreground' => \$foreground, 'listen_port=i' => \$listen_port, 'log_file=s' => \$log_file, 'max_neg_age=i' => \$max_neg_age, 'max_pos_age=i' => \$max_pos_age, 'server_name=s' => \$real_server_name, 'server_port=i' => \$real_server_port, 'ignore_negative' => sub { $max_neg_age = 0 }, 'help' => \$help, 'verbose+' => \$verbosity ); die "$USAGEMSG" if $help; if (($log_file eq '') && ($foreground == 0)) { $log_file = DEFAULT_LOG_FILE; } #Set up logging to a file or STDERR. if ($log_file ne '') { $log_fh = IO::File->new($log_file, O_WRONLY|O_APPEND) or die "Unable to open $log_file for append - please create and check permissions: $!.\n"; } else { $log_fh = IO::File->new_from_fd(\*STDERR,">") or die "Unable to reopen STDERR for logging: $!.\n"; } $log_fh->autoflush(1); #Don't use Debug before this point. #Split port number out if embedded in server name if ($real_server_name =~ /:/) { ($real_server_name, $real_server_port) = split(/:/, $real_server_name, 2); } #It appears the razor servers actually use 0x0a as the terminator for all queries and responses, so we use LF instead of CRLF. $/ = LF; #Input record separator #Setting up the server port so we can allow incoming connections. my $incoming_sock = IO::Socket::INET->new( Listen => 20, LocalPort => $listen_port, Proto => 'tcp', Timeout => 60*60, Reuse => 1) or die "Can't create listening socket: $!\n"; Debug 1, "Waiting for incoming connections on port $listen_port...\n"; my $pid_fh = open_pid_file($pid_file); if ($foreground) { $parent_pid = $$; } else { $parent_pid = become_daemon(); } print $pid_fh "$parent_pid"; close $pid_fh; while (!$quit) { next unless my $incoming_session = $incoming_sock->accept; my $peer_host = gethostbyaddr($incoming_session->peeraddr,AF_INET) || $incoming_session->peerhost; my $peer_port = $incoming_session->peerport; Debug 1, "Connection from [$peer_host,$peer_port]\n"; #fork off a child to handle $incoming_session defined (my $child = fork()) or die "Can't fork: $!"; if ($child == 0) { $incoming_sock->close; #Child doesn't care about the $incoming_sock listening socket, only the $incoming_session connected socket. my $real_server_sock; my $real_server_connected = 0; my ($client_requests, $cached_responses, $locally_generated_responses, $real_server_responses)= (0, 0, 0, 0); my ($positive_responses, $negative_responses, $other_responses) = (0, 0, 0); sub ClientStats { my $cli_req = shift; my $cache_resp = shift; my $local_resp = shift; my $serv_resp = shift; my $pos_resp = shift; my $neg_resp = shift; my $other_resp = shift; if ($cli_req > 0) { printf $log_fh ("%d client requests = %d (%.1f%%) served locally from cache + %d (%.1f%%) created locally + %d (%.1f%%) from the real server.\n", $cli_req, $cache_resp, (($cache_resp * 100 ) / $cli_req), $local_resp, (($local_resp * 100 ) / $cli_req), $serv_resp, (($serv_resp * 100 ) / $cli_req)); printf $log_fh ("%d positive responses, %d negative responses, %d other responses.\n", $pos_resp, $neg_resp, $other_resp); } else { printf $log_fh ("%d client requests.\n", $cli_req); } } #End sub ClientStats sub ConnectToServer { #Suggested call: #ConnectToServer $real_server_name, $real_server_port; #Uses (semi) global $real_server_sock and $real_server_connected . Sorry. :-) Passing filehandles looks messy. my $server_name = shift; my $server_port = shift; if ( ($real_server_name ne '') && (not($real_server_connected)) ) { Debug 2, "Opening connection to server.\n"; $real_server_sock = IO::Socket::INET -> new( PeerAddr => $server_name, PeerPort => $server_port, Timeout => 60 ) or die "Connection to $server_name:$server_port failed: $!\n"; $real_server_sock->autoflush(1); $real_server_connected = 1; Debug 1, "Connected to $server_name:$server_port.\n"; if ($server_port == 2702) { $/ = LF; my $server_banner=<$real_server_sock>; chomp $server_banner; Debug 2, "Server sent back \"$server_banner\".\n"; } } #non-null server name and not already connected } #End sub ConnectToServer #Load locally cached queries and responses from $cache_file #load after child forks so we always pull in the newest data. #We don't check records for age because it would involve additional work #for each cache entry at start time, and the records may become #too old between the time loaded and the time accessed. We'll discard a #given record at access time, restricting the work one proxy child needs to #do to about 20 timestamp checks instead of multiple hundreds of thousands. if (my $cache_fh = IO::File->new($cache_file, O_RDONLY)) { # |O_CREAT not used, security risk my $cache_query; my $cache_response; my $cache_timestamp; my $loaded_pairs = 0; undef $!; #process one cache entry from local file. while (defined(my $cache_line = <$cache_fh>)) { chomp $cache_line; ($cache_timestamp, $cache_query, $cache_response) = split(/$field_separator/, $cache_line, 3); Debug 3, "Read \"$cache_timestamp,$cache_query,$cache_response\".\n"; if ($cache_timestamp =~ /^key/) { die "$cache_file is in the old query/response format. Please delete and recreate it or convert to the new timestamp/query/response format - see the README file.\n"; } $cache_timestamp = 0 if ($cache_timestamp eq ''); #look for already cached entries; keep the newer entry. if ((defined $responses{$cache_query}) && (defined $timestamps{$cache_query})) { if ($timestamps{$cache_query} < $cache_timestamp) { #Existing array entry is older; replace with newer entry just loaded from cache file. Debug 2, "Multiple cache lines for query: $cache_query.\n"; $responses{$cache_query} = $cache_response; $timestamps{$cache_query} = $cache_timestamp; } } else { #Non-existing entry (or one where only one of $responses{} or $timestamps{} is set) - store it in responses array. $responses{$cache_query} = $cache_response; $timestamps{$cache_query} = $cache_timestamp; $loaded_pairs++; } } close $cache_file; Debug 1, "loaded $loaded_pairs unique cached queries from $cache_file.\n"; } elsif ($missing_cache_warned == 0) { #Warn once about missing or unreadable cache file. Debug 0, "Local cache file $cache_file unavailable or unreadable (create it if it's not there and check permissions, please): $!\n"; $missing_cache_warned++; } #End of load cache file entries #Note that we no longer automatically connect to the server at this point. #We wait until we actually need to send a query before even attempting to connect. #This may save us a bunch of connects if the keys we need are already in the cache. #print $incoming_session "Vipul's Razor Cache " . CACHE_VERSION . ", protocol version 2." . LF; #Client gets fussy about server version string. Reminds me of Win 3.x refusing to load over DR-DOS... :-) print $incoming_session "Vipul's Razor 1.11, protocol version 2.\n"; $/ = LF; while (my $client_query=<$incoming_session>) { #We need to strip off either or both of CR and LF. This gets CR, LF, or CRLF forms of line separator. $/ = LF; chomp $client_query; $/ = CR; chomp $client_query; $/ = LF; Debug 2, "-------- Query $client_query received.\n"; #if ((age=(currtime - sigtime)) > max_pos|neg_age) { discard key from both arrays } #Throw away $responses{$client_query} and $timestamps{$client_query} here if defined and too old. if (defined $responses{$client_query}) { if (defined $timestamps{$client_query}) { if ($responses{$client_query} =~ /^Positive /) { delete $responses{$client_query}, $timestamps{$client_query} if (time() - $timestamps{$client_query} > $max_pos_age); } elsif ($responses{$client_query} =~ /^Negative /) { delete $responses{$client_query}, $timestamps{$client_query} if (time() - $timestamps{$client_query} > $max_neg_age); } else { delete $responses{$client_query}, $timestamps{$client_query} if (time() - $timestamps{$client_query} > MAX_OTHER_AGE); } } else { delete $responses{$client_query}; } } if ($client_query eq '.') { #End of queries. Now we process the responses sent by the server. #If we managed to get this far without ever connecting to the server to send any #queries, there's no point in connecting now to get any responses, because there #won't be any. #If $real_server is '', all previous requests have already been handled at request time. if ($real_server_connected) { print $real_server_sock ".\n"; #Nudge server to send back all queued responses. #Grab all response lines the server has to give us. This approach doesn't care #if the server actually responds to all queries, or even adds in some answers #to questions we never asked, we just take whatever answers come back, #hand them off to the client, and cache as appropriate. $/ = LF; while (my $response=<$real_server_sock>) { Debug 2, "Response $response received.\n"; $real_server_responses++; chomp $response; print $incoming_session "$response\n"; my ($response_verdict, $response_key) = split(/\s+/, $response, 2); if ($response_verdict eq "Positive") { $positive_responses++; my $recreated_query = "key:$response_key&action:lookup"; $responses{$recreated_query} = $response; $timestamps{$recreated_query} = time(); saveline "$cache_file", "$timestamps{$recreated_query}$field_separator$recreated_query$field_separator$response" ; } elsif ($response_verdict eq "Negative") { $negative_responses++; my $recreated_query = "key:$response_key&action:lookup"; $responses{$recreated_query} = $response; $timestamps{$recreated_query} = time(); saveline "$cache_file", "$timestamps{$recreated_query}$field_separator$recreated_query$field_separator$response" ; } elsif ($response_verdict eq "Accepted") { $other_responses++; #One might argue this counts as positive, but... my $recreated_query = "key:$response_key&action:report"; $responses{$recreated_query} = $response; $timestamps{$recreated_query} = time(); #Create a simulated query to match future lookups for this reported key. my $simulated_positive_query = "key:$response_key&action:lookup"; my $simulated_positive_response = "Positive $response_key"; #We don't have to check for $ignore_negative here because all Accepted keys are Positive. $responses{$simulated_positive_query} = $simulated_positive_response; $timestamps{$simulated_positive_query} = $timestamps{$recreated_query}; saveline "$cache_file", "$timestamps{$recreated_query}$field_separator$recreated_query$field_separator$response"; saveline "$cache_file", "$timestamps{$recreated_query}$field_separator$simulated_positive_query$field_separator$simulated_positive_response"; } elsif ($response_verdict eq "Rejected") { $other_responses++; my $recreated_query = "key:$response_key&action:report"; $responses{$recreated_query} = $response; $timestamps{$recreated_query} = time(); #This wasn't accepted by the server, so we can't create a simulated Positive or Negative entry. saveline "$cache_file", "$timestamps{$recreated_query}$field_separator$recreated_query$field_separator$response"; } elsif ($response eq "Unknown command.") { #Do nothing, we can't match it up with a command (we already handed this off to the client). $other_responses++; Debug 2, "Ignoring \"Unknown command.\" response.\n"; } elsif ($real_server_port == 7) { #Echo port server; hand back responses as is. $other_responses++; $responses{$response} = $response; $timestamps{$response} = time(); saveline "$cache_file", "$timestamps{$response}$field_separator$response$field_separator$response"; } else { $other_responses++; Debug 1, "Unknown $response_verdict in $response.\n"; } } #End of loop to read all server responses close $real_server_sock; #We know we're connected from the above test. $real_server_connected = 0; } #End of "If we're connected to the server" Debug 1, "Connection from [$peer_host,$peer_port] finished.\n"; #The client waits for us to close the connection, so do so. close $incoming_session if ($incoming_session->connected); ClientStats $client_requests, $cached_responses, $locally_generated_responses, $real_server_responses, $positive_responses, $negative_responses, $other_responses if ($verbosity >= 1); exit 0; } elsif (defined $responses{$client_query}) { #We have a cached entry for this already, hand it back to the user. if ($responses{$client_query} =~ /^Positive/) { $positive_responses++; } elsif ($responses{$client_query} =~ /^Negative/) { $negative_responses++; } else { $other_responses++; } #We already checked for age above. $client_requests++; $cached_responses++; Debug 1, "Cached response to $client_query.\n"; print $incoming_session "$responses{$client_query}\n"; } elsif ($client_query =~ /action:report$/) { $client_requests++; if ($real_server_name ne '') { #Hand report action directly to server, do not cache at all. Debug 1, "Send off report $client_query.\n"; #Here we do a (late) connect to the real server if not already connected ConnectToServer $real_server_name, $real_server_port; print $real_server_sock "$client_query\n"; } else { #User has specified a server of '' - there's no parent to hand the query to. Handle locally. $locally_generated_responses++; $positive_responses++; #In the absence of blacklisted reporters, we'll accept all submissions and store a positive in the array and the cache file. Debug 1, "NullServer handle report $client_query.\n"; my ($junk1, $hash_action, $junk2) = split(/:/, $client_query, 3); #$hash_action=d9...56&action my ($hash, $junk3) = split(/&/, $hash_action, 2); my $simulated_query = "key:$hash&action:lookup"; $responses{$simulated_query} = "Positive $hash"; $timestamps{$simulated_query} = time(); saveline "$cache_file", "$timestamps{$simulated_query}$field_separator$simulated_query$field_separator$responses{$simulated_query}"; print $incoming_session "Accepted $hash\n"; } } elsif ($client_query =~ /action:lookup$/) { #We have no cached entry - ask the real server. $client_requests++; if ($real_server_name ne '') { Debug 1, "Live query to lookup $client_query.\n"; #Here we do a (late) connect to the real server if not already connected ConnectToServer $real_server_name, $real_server_port; print $real_server_sock "$client_query\n"; } else { #User has specified a server of '' - there's no parent to hand the query to. Handle locally. $locally_generated_responses++; $negative_responses++; Debug 1, "NullServer Live query to lookup $client_query.\n"; #They're looking up a hash we don't have. Hand back Negative. my ($junk1, $hash_action, $junk2) = split(/:/, $client_query, 3); #$hash_action=d9...56&action my ($hash, $junk3) = split(/&/, $hash_action, 2); print $incoming_session "Negative $hash\n"; } } else { $client_requests++; if ($real_server_name ne '') { Debug 1, "Live query to unknown $client_query.\n"; #Here we do a (late) connect to the real server if not already connected ConnectToServer $real_server_name, $real_server_port; print $real_server_sock "$client_query\n"; } else { #User has specified a server of '' - there's no parent to hand the query to. Handle locally. $other_responses++; $locally_generated_responses++; #Hmmm. It's neither an &action:report, an &action:lookup nor a '.'. What's the user asking? Debug 1, "NullServer Live query to Unknown $client_query.\n"; print $incoming_session "Unknown command.\n"; } } #End of block to handle different $client_query types } #End of loop to read $client_query lines Debug 1, "Connection from [$peer_host,$peer_port] finished.\n"; if ($real_server_connected) { close $real_server_sock; $real_server_connected = 0; } #The client waits for us to close the connection, so do so. close $incoming_session if ($incoming_session->connected); ClientStats $client_requests, $cached_responses, $locally_generated_responses, $real_server_responses, $positive_responses, $negative_responses, $other_responses if ($verbosity >= 1); exit 0; } #End of child task fork #The parent does not need to do anything more with the $incoming_session connected socket. close $incoming_session if ($incoming_session->connected); } #End of main parent "!$quit" loop close $incoming_sock; Debug 1, "Parent closing.\n"; #Remove the pid file on the way out. END { unlink $pid_file if $$ == $parent_pid; };