#!/usr/bin/perl # file: razor2-caching-proxy.pl # #Copyright 2002 William Stearns #Released under the GPL. die "This code is not in a functioning state yet, sorry.\n"; use strict; use IO::Socket qw(:DEFAULT :crlf); use IO::Socket; use Socket; use IO::Handle; use IO::File; use Getopt::Long; use POSIX 'WNOHANG'; use POSIX 'setsid'; use constant CACHE_VERSION => '1.9.0'; use constant DEFAULT_CACHE_FILE => '/var/cache/razor/razor2-cache'; use constant DEFAULT_LOG_FILE => '/var/log/razor2'; use constant DEFAULT_MAX_NEG_AGE => 20 * 60; # = 20 minutes use constant DEFAULT_MAX_POS_AGE => 30 * 24 * 60 * 60; # = 30 days use constant DEFAULT_RAZOR_PORT => 2703; use constant DEFAULT_REAL_SERVER_NAME => '209.204.62.150'; use constant MAX_OTHER_AGE => 60 * 60; #Hardcoded 1 hour for non-Positive, non-Negative. #Handle interrupt (Ctrl-C) between client connections my $quit = 0; $SIG{INT} = sub { $quit++ }; #Handle, umm, ignore exiting child events $SIG{CHLD} = sub { while (waitpid(-1,WNOHANG)>0) { } }; #Handle TERM and INT with explicit exit so we can unlink PID file on the way out. $SIG{TERM} = $SIG{INT} = sub { exit 0; }; #Initial responses, we'll load additional responses from the command line specified cache file. #The Positive hash string listed below is from the sample spam included with the razor package. #This guarantees we'll always have at least one live spam signature to test against. my %responses = ( "cache_version" => "Razor Caching Server live, version " . CACHE_VERSION . ".", ); #"key:d97b1764b257f4f67a7d7fc9e6988aace12c5a56&action:lookup" => "Positive d97b1764b257f4f67a7d7fc9e6988aace12c5a56" my %timestamps = ( "cache_version" => 2000000000, ); #2000000000 is in May, 2033 :-), 1022560800 = Tue May 28, 2002 00:40 EDT #"key:d97b1764b257f4f67a7d7fc9e6988aace12c5a56&action:lookup" => 1022560800 #Command line options: my $cache_file = DEFAULT_CACHE_FILE; my $listen_port = DEFAULT_RAZOR_PORT; my $log_file = ''; my $foreground = 0; my $max_neg_age = DEFAULT_MAX_NEG_AGE; my $max_pos_age = DEFAULT_MAX_POS_AGE; my $real_server_name = DEFAULT_REAL_SERVER_NAME; my $real_server_port = DEFAULT_RAZOR_PORT; my $help = 0; my $verbosity = 0; #Other variables: my $cache_read_fh; my $missing_cache_warned = 0; my $parent_pid; my $log_fh; my $field_separator = '/'; #Don't use ':', '&', '=' or alphanumeric as these are used in the actual queries and responses. #'/' and ',' seem to work fine, '|' does not. #Make sure you edit any cache files to use this new separator if you change this here. my $pid_file; if ($< == 0) { #pick an appropriate directory for the pid file $pid_file = '/var/run/razor2-caching-proxy.pid'; #based on real UID is root or not. Allows non-root } else { #users to still run this. Yes, two copies could be $pid_file = '/var/tmp/razor2-caching-proxy.pid'; #running at the same time, but not on the same port. } #Subroutines sub become_daemon { die "Can't fork" unless defined (my $child = fork); exit 0 if $child; #Parent exits setsid(); #Become session leader open(STDIN, "/dev/null"); open(STDERR, ">&STDOUT"); chdir '/'; umask(0); #forget file mode creation mask $ENV{PATH} = '/bin:/sbin:/usr/bin:/usr/sbin'; return $$; } #End sub become_daemon sub Debug { my $DebugLevel = shift; if ($verbosity >= $DebugLevel) { my $DebugString = shift; print $log_fh "$DebugString"; } } #End sub Debug sub open_pid_file { my $file = shift; if (-e $file) { #file already exists my $fh = IO::File->new($file) || return; my $pid = <$fh>; die "Server already running with PID $pid" if kill 0 => $pid; warn "Removing PID file for defunct server process $pid.\n"; die "Can't unlink PID file $file" unless -w $file && unlink $file; } my $pid_fh = IO::File->new($file, O_WRONLY|O_CREAT|O_EXCL, 0644) or die "Can't create $file: $!\n"; } #End sub open_pid_file sub saveline { #Example call: # saveline "$cache_file", "$timestamp$field_separator$original_query$field_separator$response"; my $cache_filename = shift; my $line_to_add = shift; #Save in cache file so we can load it in on future invocations. #Note that we save all entries, regardless of age. We also _load_ #all entries, regardles of age. The only check for whether an #entry should be retired is right after we get a query from a client; #too-old entries get deleted from the %timestamps and %reponses #hashes. That way we check at most 20 entries for age. At load time, #we'd have to check thousands to hundreds of thousands. if (my $cache_write_fh = IO::File->new("$cache_filename", O_WRONLY|O_APPEND)) { # |O_CREAT not used, security risk $cache_write_fh->autoflush(1); print $cache_write_fh "$line_to_add\n"; close $cache_write_fh; Debug 3, "stored \"$line_to_add\" in cache file: $cache_file.\n"; } elsif ($missing_cache_warned == 0) { #Warn once about missing or unwritable cache file. Debug 0, "Local cache file $cache_file unavailable or unwritable for storing new entries (create it if it's not there and check permissions, please): $!.\n"; $missing_cache_warned++; } } #End sub saveline my $USAGEMSG = < File that holds cached queries and responses ($cache_file) * --listen_port Port this proxy should listen on for incoming connections. ($listen_port) --log_file Filename to log debug messages to (STDERR). /var/log/razor2, perhaps. * --foreground Run proxy in the current terminal instead of the default of running as a daemon --max_neg_age Maximum age of a Negative cache entry before it is discarded. ($max_neg_age) --max_pos_age Maximum age of a Positive cache entry before it is discarded. ($max_pos_age) --server_name Real server this proxy consults for uncached queries ($real_server_name) --server_name also legal (this :port overrides any --server_port) --server_port Port on which that server listens ($real_server_port) --help This help message --verbose Send debug output to stdout, use twice to see network details, three times to show all loads and stores too. * For security reasons, these files must be created before starting the proxy or they will not be used at all. If you would not like to cache Negative responses, set --max_neg_age to some small number of seconds, say 60. Examples: To test the proxy (enable the echo service on localhost first): razor-caching-proxy.pl --server_n=localhost:7 --ca=/tmp/razor-cache -v Specify an alternate real server and listen for incoming connections on 8000: razor-caching-proxy.pl --server_n=a.razor.vipul.net. --listen_port 8000 USAGE #Add this back in once supported again. # To run independantly, i.e. without any parent server to consult, #add #--server_name '' #Load command line params. die "$USAGEMSG" unless GetOptions( 'cache_file=s' => \$cache_file, 'foreground' => \$foreground, 'listen_port=i' => \$listen_port, 'log_file=s' => \$log_file, 'max_neg_age=i' => \$max_neg_age, 'max_pos_age=i' => \$max_pos_age, 'server_name=s' => \$real_server_name, 'server_port=i' => \$real_server_port, 'ignore_negative' => sub { $max_neg_age = 0 }, 'help' => \$help, 'verbose+' => \$verbosity ); die "$USAGEMSG" if $help; if (($log_file eq '') && ($foreground == 0)) { $log_file = DEFAULT_LOG_FILE; } #Set up logging to a file or STDERR. if ($log_file ne '') { $log_fh = IO::File->new($log_file, O_WRONLY|O_APPEND) or die "Unable to open $log_file for append - please create and check permissions: $!.\n"; } else { $log_fh = IO::File->new_from_fd(\*STDERR,">") or die "Unable to reopen STDERR for logging: $!.\n"; } $log_fh->autoflush(1); #Don't use Debug before this point. #Split port number out if embedded in server name if ($real_server_name =~ /:/) { ($real_server_name, $real_server_port) = split(/:/, $real_server_name, 2); } if ($real_server_name eq '') { die "A real server to which we connect is required at the moment.\n"; } #It appears the razor2 servers use 0x0d0a as the terminator for all queries and responses, so we use CRLF. $/ = CRLF; #Input record separator #Setting up the server port so we can allow incoming connections. my $incoming_sock = IO::Socket::INET->new( Listen => 20, LocalPort => $listen_port, Proto => 'tcp', Timeout => 60*60, Reuse => 1) or die "Can't create listening socket: $!\n"; Debug 1, "Waiting for incoming connections on port $listen_port...\n"; my $pid_fh = open_pid_file($pid_file); if ($foreground) { $parent_pid = $$; } else { $parent_pid = become_daemon(); } print $pid_fh "$parent_pid"; close $pid_fh; #Load locally cached queries and responses from $cache_file #We used to load everything after child forks to pull in the newest data. #Now we load everything there before we accept any connections, but we leave #the filehandle open for read. Just before we fork off a child to handle a #connection, we try to read any new lines that came in since the last check # - see below. # #We don't check records for age because it would involve additional work #for each cache entry at start time, and the records may become #too old between the time loaded and the time accessed. We'll discard a #given record at access time, restricting the work one proxy child needs to #do to about 20 timestamp checks instead of multiple hundreds of thousands. if ($cache_read_fh = IO::File->new($cache_file, O_RDONLY)) { # |O_CREAT not used, security risk my $loaded_pairs = 0; undef $!; while (defined(my $cache_line = <$cache_read_fh>)) { #process one cache entry from local file. chomp $cache_line; my ($cache_timestamp, $cache_query, $cache_response) = split(/$field_separator/, $cache_line, 3); Debug 3, "Read \"$cache_timestamp,$cache_query,$cache_response\".\n"; if ($cache_timestamp =~ /^key/) { die "$cache_file is in the old query/response format. Please delete and recreate it or convert to the new timestamp/query/response format - see the README file.\n"; } $cache_timestamp = 0 if ($cache_timestamp eq ''); #look for already cached entries; keep the newer entry. if ((defined $responses{$cache_query}) && (defined $timestamps{$cache_query})) { if ($timestamps{$cache_query} < $cache_timestamp) { #Existing array entry is older; replace with newer entry just loaded from cache file. Debug 2, "Multiple cache lines for query: $cache_query.\n"; $responses{$cache_query} = $cache_response; $timestamps{$cache_query} = $cache_timestamp; } } else { #Non-existing entry (or one where only one of $responses{} or $timestamps{} is set) - store it in responses array. $responses{$cache_query} = $cache_response; $timestamps{$cache_query} = $cache_timestamp; $loaded_pairs++; } } #Don't close here - leave the filehandle open so we can recheck for more lines just before fork. Debug 1, "Initial load: loaded $loaded_pairs unique cached queries from $cache_file.\n"; } elsif ($missing_cache_warned == 0) { #Warn once about missing or unreadable cache file. Debug 0, "Local cache file $cache_file unavailable or unreadable (create it if it's not there and check permissions, please): $!\n"; $missing_cache_warned++; } #End of load cache file entries while (!$quit) { next unless my $incoming_session = $incoming_sock->accept; my $peer_host = gethostbyaddr($incoming_session->peeraddr,AF_INET) || $incoming_session->peerhost; my $peer_port = $incoming_session->peerport; Debug 1, "Connection from [$peer_host,$peer_port]\n"; #Last minute check for any new lines that showed up in the cache_file. my $loaded_pairs = 0; undef $!; while (defined(my $cache_line = <$cache_read_fh>)) { #process one cache entry from local file. chomp $cache_line; my ($cache_timestamp, $cache_query, $cache_response) = split(/$field_separator/, $cache_line, 3); Debug 3, "Read \"$cache_timestamp,$cache_query,$cache_response\".\n"; if ($cache_timestamp =~ /^key/) { die "$cache_file is in the old query/response format. Please delete and recreate it or convert to the new timestamp/query/response format - see the README file.\n"; } $cache_timestamp = 0 if ($cache_timestamp eq ''); #look for already cached entries; keep the newer entry. if ((defined $responses{$cache_query}) && (defined $timestamps{$cache_query})) { if ($timestamps{$cache_query} < $cache_timestamp) { #Existing array entry is older; replace with newer entry just loaded from cache file. Debug 2, "Multiple cache lines for query: $cache_query.\n"; $responses{$cache_query} = $cache_response; $timestamps{$cache_query} = $cache_timestamp; } } else { #Non-existing entry (or one where only one of $responses{} or $timestamps{} is set) - store it in responses array. $responses{$cache_query} = $cache_response; $timestamps{$cache_query} = $cache_timestamp; $loaded_pairs++; } } Debug 1, "loaded $loaded_pairs more unique cached queries from $cache_file.\n"; #fork off a child to handle $incoming_session defined (my $child = fork()) or die "Can't fork: $!"; if ($child == 0) { $incoming_sock->close; #Child doesn't care about the $incoming_sock listening socket, only the $incoming_session connected socket. close $cache_read_fh; #Child won't be doing any more reading from the cache file, just writing. my $real_server_sock; my $real_server_connected = 0; my ($client_requests, $cached_responses, $locally_generated_responses, $real_server_responses)= (0, 0, 0, 0); my ($positive_responses, $negative_responses, $other_responses) = (0, 0, 0); my @outstanding_requests = ( ); #a fifo for the requests sent off: #push (@outstanding_requests, $new_request); to add it to the end of the array. #$head_request = shift(@outstanding_requests); to pull off first entry. sub ClientStats { my $cli_req = shift; my $cache_resp = shift; my $local_resp = shift; my $serv_resp = shift; my $pos_resp = shift; my $neg_resp = shift; my $other_resp = shift; if ($cli_req > 0) { printf $log_fh ("%d client requests = %d (%.1f%%) served locally from cache + %d (%.1f%%) created locally + %d (%.1f%%) from the real server.\n", $cli_req, $cache_resp, (($cache_resp * 100 ) / $cli_req), $local_resp, (($local_resp * 100 ) / $cli_req), $serv_resp, (($serv_resp * 100 ) / $cli_req)); printf $log_fh ("%d positive responses, %d negative responses, %d other responses.\n", $pos_resp, $neg_resp, $other_resp); } else { printf $log_fh ("%d client requests.\n", $cli_req); } } #End sub ClientStats sub ConnectToServer { #Suggested call: #ConnectToServer $real_server_name, $real_server_port; #Uses (semi) global $real_server_sock and $real_server_connected . Sorry. :-) Passing filehandles looks messy. my $server_name = shift; my $server_port = shift; if ( ($real_server_name ne '') && (not($real_server_connected)) ) { Debug 2, "Opening connection to server.\n"; $real_server_sock = IO::Socket::INET -> new( PeerAddr => $server_name, PeerPort => $server_port, Timeout => 60 ) or die "Connection to $server_name:$server_port failed: $!\n"; $real_server_sock->autoflush(1); $real_server_connected = 1; Debug 1, "Connected to $server_name:$server_port.\n"; #proto2 start if ($server_port == DEFAULT_RAZOR_PORT) { my $server_banner=<$real_server_sock>; chomp $server_banner; Debug 2, "Server sent back \"$server_banner\".\n"; print $incoming_session "$server_banner" . CRLF; } } #non-null server name and not already connected } #End sub ConnectToServer #We automatically connect to the server at this point. #Razor1 proxy waits until we actually need to send a query before even attempting to connect, #which saves us a bunch of connects if the keys we need are already in the cache. Unfortunately, #the razor2 client needs the server banner to determine the ephemeral seed. ConnectToServer $real_server_name, $real_server_port; while (my $client_query=<$incoming_session>) { chomp $client_query; if ($client_query =~ /^-/) { $client_query =~ s/^.//s; #strip leading "-" } Debug 2, "-------- Query $client_query received.\n"; #if ((age=(currtime - sigtime)) > max_pos|neg_age) { discard key from both arrays } #Throw away $responses{$client_query} and $timestamps{$client_query} here if defined and too old. if (defined $responses{$client_query}) { if (defined $timestamps{$client_query}) { if ($responses{$client_query} =~ /^Positive /) { delete $responses{$client_query}, $timestamps{$client_query} if (time() - $timestamps{$client_query} > $max_pos_age); } elsif ($responses{$client_query} =~ /^Negative /) { delete $responses{$client_query}, $timestamps{$client_query} if (time() - $timestamps{$client_query} > $max_neg_age); } else { delete $responses{$client_query}, $timestamps{$client_query} if (time() - $timestamps{$client_query} > MAX_OTHER_AGE); } } else { delete $responses{$client_query}; } } if ($client_query eq '.') { #End of this block of queries. Now we process the responses sent by the server. #If we managed to get this far without ever connecting to the server to send any #queries, there's no point in connecting now to get any responses, because there #won't be any. #If $real_server is '', all previous requests have already been handled at request time. if ($real_server_connected) { print $real_server_sock "." . CRLF; #Nudge server to send back all queued responses. #Grab all response lines the server has to give us. This approach doesn't care #if the server actually responds to all queries, or even adds in some answers #to questions we never asked, we just take whatever answers come back, #hand them off to the client, and cache as appropriate. SERVER_RESP: while (my $response=<$real_server_sock>) { if ($reponse =~ /^-/) { $reponse =~ s/^.//s; #strip leading "-" } if ($response == '.') { #Stop reading from server once we hit a '.', switch back to reading from client. last SERVER_RESP; } Debug 2, "Response $response received.\n"; $real_server_responses++; chomp $response; print $incoming_session "$response" . CRLF; if ($response =~ /p=1$/) { #Positive $positive_responses++; if (@outstanding_requests) { #Check for an empty @outstanding_requests at each shift. my $original_query = shift(@outstanding_requests); $responses{$original_query} = $response; $timestamps{$original_query} = time(); saveline "$cache_file", "$timestamps{$original_query}$field_separator$original_query$field_separator$response" ; } else { Debug 1, "Ouch! Ran out of queries to match up with responses at response $reponse.\n"; } } elsif ($response =~ /p=0$/) { #Negative $negative_responses++; if (@outstanding_requests) { my $original_query = shift(@outstanding_requests); $responses{$original_query} = $response; $timestamps{$original_query} = time(); saveline "$cache_file", "$timestamps{$original_query}$field_separator$original_query$field_separator$response" ; } else { Debug 1, "Ouch! Ran out of queries to match up with responses at response $reponse.\n"; } } elsif ($response =~ /res=1$/) { #Accepted $other_responses++; #One might argue this counts as positive, but... if (@outstanding_requests) { my $original_query = shift(@outstanding_requests); $responses{$original_query} = $response; $timestamps{$original_query} = time(); #FIXME - redo for razor2 ##Create a simulated query to match future lookups for this reported key. #my $simulated_positive_query = "key:$response_key&action:lookup"; #my $simulated_positive_response = "Positive $response_key"; ##We don't have to check for $ignore_negative here because all Accepted keys are Positive. #$responses{$simulated_positive_query} = $simulated_positive_response; #$timestamps{$simulated_positive_query} = $timestamps{$original_query}; saveline "$cache_file", "$timestamps{$original_query}$field_separator$original_query$field_separator$response"; #saveline "$cache_file", "$timestamps{$original_query}$field_separator$simulated_positive_query$field_separator$simulated_positive_response"; } else { Debug 1, "Ouch! Ran out of queries to match up with responses at response $reponse.\n"; } } elsif ($response =~ /res=0$/) { #Rejected $other_responses++; if (@outstanding_requests) { my $original_query = shift(@outstanding_requests); $responses{$original_query} = $response; $timestamps{$original_query} = time(); #This wasn't accepted by the server, so we can't create a simulated Positive or Negative entry. saveline "$cache_file", "$timestamps{$original_query}$field_separator$original_query$field_separator$response"; } else { Debug 1, "Ouch! Ran out of queries to match up with responses at response $reponse.\n"; } } elsif ($real_server_port == 7) { #Echo port server; hand back responses as is. $other_responses++; $responses{$response} = $response; $timestamps{$response} = time(); saveline "$cache_file", "$timestamps{$response}$field_separator$response$field_separator$response"; } elsif ($response =~ /^err=/) { #Error message, just keep stat on it. $other_responses++; } else { $other_responses++; Debug 1, "Unknown $response.\n"; } } #End of loop to read all server responses } #End of "If we're connected to the server" } elsif (defined $responses{$client_query}) { #We have a cached entry for this already, hand it back to the user. if ($responses{$client_query} =~ /p=1$/) { $positive_responses++; } elsif ($responses{$client_query} =~ /p=0$/) { $negative_responses++; } else { $other_responses++; } #We already checked for age above. $client_requests++; $cached_responses++; Debug 1, "Cached response to $client_query.\n"; print $incoming_session "$responses{$client_query}" . CRLF; } elsif ($client_query =~ /^a=r&/) { #report $client_requests++; #FIXME - reinstate null server handling #if ($real_server_name ne '') { #Hand report action directly to server, do not cache at all. Debug 1, "Send off report $client_query.\n"; #Here we do a (late) connect to the real server if not already connected ConnectToServer $real_server_name, $real_server_port; print $real_server_sock "$client_query" . CRLF; push (@outstanding_requests, $client_query); #} else { #User has specified a server of '' - there's no parent to hand the query to. Handle locally. # $locally_generated_responses++; # $positive_responses++; # #In the absence of blacklisted reporters, we'll accept all submissions and store a positive in the array and the cache file. # Debug 1, "NullServer handle report $client_query.\n"; # my ($junk1, $hash_action, $junk2) = split(/:/, $client_query, 3); #$hash_action=d9...56&action # my ($hash, $junk3) = split(/&/, $hash_action, 2); # # my $simulated_query = "key:$hash&action:lookup"; # $responses{$simulated_query} = "Positive $hash"; # $timestamps{$simulated_query} = time(); # # saveline "$cache_file", "$timestamps{$simulated_query}$field_separator$simulated_query$field_separator$responses{$simulated_query}"; # # print $incoming_session "Accepted $hash" . CRLF; #} } elsif ($client_query =~ /^a=c&/) { #Check if spam. We have no cached entry - ask the real server. $client_requests++; #FIXME - reinstate null server handling #if ($real_server_name ne '') { Debug 1, "Live query to lookup $client_query.\n"; #Here we do a (late) connect to the real server if not already connected ConnectToServer $real_server_name, $real_server_port; print $real_server_sock "$client_query" . CRLF; push (@outstanding_requests, $client_query); #} else { #User has specified a server of '' - there's no parent to hand the query to. Handle locally. # $locally_generated_responses++; # $negative_responses++; # Debug 1, "NullServer Live query to lookup $client_query.\n"; # #They're looking up a hash we don't have. Hand back Negative. # my ($junk1, $hash_action, $junk2) = split(/:/, $client_query, 3); #$hash_action=d9...56&action # my ($hash, $junk3) = split(/&/, $hash_action, 2); # print $incoming_session "Negative $hash" . CRLF; #} #FIXME - relay entire message bodies. } else { $client_requests++; #FIXME - reinstate null server handling #if ($real_server_name ne '') { Debug 1, "Live query to unknown $client_query.\n"; #Here we do a (late) connect to the real server if not already connected ConnectToServer $real_server_name, $real_server_port; print $real_server_sock "$client_query" . CRLF; push (@outstanding_requests, $client_query); #} else { #User has specified a server of '' - there's no parent to hand the query to. Handle locally. # $other_responses++; # $locally_generated_responses++; # #Hmmm. It's neither an &action:report, an &action:lookup nor a '.'. What's the user asking? # Debug 1, "NullServer Live query to Unknown $client_query.\n"; # print $incoming_session "Unknown command." . CRLF; #} } #End of block to handle different $client_query types } #End of loop to read $client_query lines Debug 1, "Connection from [$peer_host,$peer_port] finished.\n"; if ($real_server_connected) { close $real_server_sock; $real_server_connected = 0; } #The client waits for us to close the connection, so do so. close $incoming_session if ($incoming_session->connected); ClientStats $client_requests, $cached_responses, $locally_generated_responses, $real_server_responses, $positive_responses, $negative_responses, $other_responses if ($verbosity >= 1); exit 0; } #End of child task fork #proto2 end #The parent does not need to do anything more with the $incoming_session connected socket. close $incoming_session if ($incoming_session->connected); } #End of main parent "!$quit" loop close $incoming_sock; close $cache_read_fh; Debug 1, "Parent closing.\n"; #Remove the pid file on the way out. END { unlink $pid_file if $$ == $parent_pid; };