Last Google Spam Filter
Like a twelve-year-old girl on myspace, Last Google has been a target of the unwanted attentions of vile, disgusting predators almost since its inception in 2004. Necessary, grudging steps have been taken to control the scourge, but the struggle has evolved into the inevitable arms race between the diseased corps of the Windows-fueled botnet and one lethargic developer pursuing a hobby of sorts in his free time. This page documents some of the steps I've taken to keep the scourge in check. My efforts on this subject are also chronicled from time to time in the lastgeist blog.
If you have any comments, inquiries, or suggestions, feel free to contact me at my email address.
Spam Filter Script
This is the latest PHP script I am using to catch form spam: <?php
/*** DOCUMENTATION LAYER
Last Google Query Validation Library
Last Update: Jul 2007
Author: Tom at klenwell@gmail.com
FUNCTIONS
lg_clean_query($query)
lg_is_spam($string)
lg_on_spam_list($query)
lg_spam_preg_match($query)
lg_spam_redirect()
NOTES
______________________________________________________________________________*/
// Shared filter state and configuration (LGSS: Last Google Site Security).
// Each key is assigned individually so entries already present in $LGSS are kept.

// Token or regex hit that flagged the most recent query; '' until a check fires.
$LGSS['spam_trigger'] = '';

// NOTE(review): presumably where flagged requests are sent -- the redirect code
// is not part of this file, confirm against the caller.
$LGSS['default_redirect'] = 'http://klenwell.net/is/SpamTrap';

// Terms consulted by lg_on_spam_list(): two drug-spam words and the forum-spam '[url' tag.
$LGSS['TERM'] = array('hydrocodone', 'phentermine', '[url');

// Pattern consulted by lg_spam_preg_match(): '%' delimiters, case-insensitive (i), ungreedy (U).
$LGSS['regex'] = '%' . implode('|', array(
    '(lo\d+)',          // e.g. lo1239 -- something to do with loopbacks?
    '(<\s*br[^\>]*>)',  // don't need line breaks
    '(viagra)',         // common spam terms
)) . '%iU';
// lg_clean_query
/*____________________________________________________________________________*/
/**
 * Normalise a raw query: keep only its first line, lower-case it and
 * collapse every run of whitespace into a single space.
 *
 * @param string $query raw query text
 * @return string cleaned query ('' when nothing is left)
 */
function lg_clean_query($query)
{
    $first_line = trim($query);

    // strpos() gives FALSE when there is no newline; FALSE > 0 is false, so the
    // text is then kept whole. After trim() a newline can never sit at offset 0.
    $break_at = strpos($first_line, "\n");
    if ( $break_at > 0 )
    {
        $first_line = substr($first_line, 0, $break_at);
    }

    // lower-case first, then squeeze whitespace runs (tabs, stray "\r", ...) to one space
    $squeezed = preg_replace('%\s+%', ' ', strtolower($first_line));

    return trim($squeezed);
}
/*____________________________________________________________________________*/
// lg_is_spam
/*____________________________________________________________________________*/
/**
 * Decide whether a submitted string looks like spam.
 *
 * Checks run in order and stop at the first hit:
 *   1. the bot-bait form field was filled in,
 *   2. lg_on_spam_list() flags a token,
 *   3. lg_spam_preg_match() flags the text,
 *   4. the text contains more than one 'http://'.
 *
 * @param string $string text to inspect (callers in this file pass it through lg_clean_query() first)
 * @return int 1 when any check fires, 0 otherwise (and always 0 for an empty string)
 */
function lg_is_spam($string)
{
    // sanity check -- note that empty() also treats the string '0' as empty
    if ( empty($string) ) return 0;

    // css-hidden text field (bot bait) source: http://it.slashdot.org/comments.pl?sid=247229&cid=19797293
    // note: not actual field name
    // !empty() instead of a bare read: the field is absent on most requests and
    // reading a missing index raises a notice/warning.
    if ( !empty($_POST['bot_trap']) ) return 1;

    // check spam list
    if ( lg_on_spam_list($string) ) return 1;

    // regex check
    if ( lg_spam_preg_match($string) ) return 1;

    // too many urls
    if ( substr_count($string, 'http://') > 1 ) return 1;

    // A newline in the input once signalled a bot (spoofed blogspot form); that
    // check stays disabled because lg_clean_query() already chops at the first newline.

    // clean
    return 0;
} // end Fx
/*____________________________________________________________________________*/
// is spam term
/*____________________________________________________________________________*/
/**
 * Check the whitespace-separated tokens of a query against $LGSS['TERM'].
 *
 * Matching rules:
 *   - comparison is case-insensitive;
 *   - a term that starts with a letter, digit or underscore must equal the whole token;
 *   - a term that starts with any other character (a markup fragment such as
 *     '[url') matches when the token begins with it, because such a fragment
 *     is never a token on its own ('[url=http://...]').
 *
 * On a hit the matching token is stored in $LGSS['spam_trigger'].
 *
 * @param string $query text to inspect
 * @return int 1 when a token matches a listed term, 0 otherwise
 */
function lg_on_spam_list($query)
{
    global $LGSS;

    // no usable term list -> nothing can match
    if ( empty($LGSS['TERM']) || !is_array($LGSS['TERM']) ) return 0;

    // split on any whitespace run (spaces, tabs, newlines) and drop empty tokens
    $tokens = preg_split('%\s+%', trim($query), -1, PREG_SPLIT_NO_EMPTY);
    if ( empty($tokens) ) return 0; // empty query, or preg_split() failed

    foreach ( $tokens as $token )
    {
        foreach ( $LGSS['TERM'] as $term )
        {
            $term = (string) $term;
            if ( $term === '' ) continue;

            if ( preg_match('%^\W%', $term) )
            {
                // markup fragment: prefix match
                $hit = ( strncasecmp($token, $term, strlen($term)) === 0 );
            }
            else
            {
                // ordinary word: whole-token match
                $hit = ( strcasecmp($token, $term) === 0 );
            }

            if ( $hit )
            {
                $LGSS['spam_trigger'] = $token;
                return 1;
            }
        }
    }

    return 0;
}
/*____________________________________________________________________________*/
// in spam regex
/*____________________________________________________________________________*/
/**
 * Run the configured spam pattern ($LGSS['regex']) against a query.
 *
 * On a hit the matched text, HTML-escaped, is stored in $LGSS['spam_trigger'].
 *
 * @param string $query text to inspect
 * @return int 1 when the pattern matches, 0 otherwise
 */
function lg_spam_preg_match($query)
{
    global $LGSS;

    $hits = array();

    // preg_match() yields 0 for "no match" and FALSE on error; both mean "not flagged"
    if ( !preg_match($LGSS['regex'], $query, $hits) )
    {
        return 0;
    }

    // element 0 holds the full text matched by the pattern
    $LGSS['spam_trigger'] = htmlspecialchars($hits[0]);

    return 1;
}
/*____________________________________________________________________________*/
// Testbed
/*____________________________________________________________________________*/
// Manual testbed: change the 0 to a true value to print, as HTML, what
// lg_clean_query() and lg_is_spam() make of the sample inputs below.
// The run ends with die(), so nothing after this block executes.
if ( 0 )
{
    // start from an empty list so a pre-existing $_TEST cannot leak in
    $_TEST = array();
    $_TEST[] = "test site <br />";
    $_TEST[] = "test site br />";
    $_TEST[] = "lo831l";
    $_TEST[] = <<<TEST
Very fabulos site!
idiot test
idiot test
[URL=http://idiotest.9999mb.com]idiot test[/URL]
idiot test http://idiotest.9999mb.com
Thank you!
TEST;
    $_TEST[] = ' PhenTermine ';
    $_TEST[] = 'this is a legit query';

    // test clean query
    foreach ( $_TEST as $test )
    {
        // escape the cleaned text as well as the input: lg_clean_query() leaves
        // markup such as <br /> intact, and it must be shown, not rendered
        $result = htmlspecialchars(lg_clean_query($test));
        $test = htmlspecialchars($test);
        echo "<p><b>$test</b> --lg_clean_query--> $result</p>";
    }

    echo "<h2>begin spam test</h2>";
    foreach ( $_TEST as $test )
    {
        $test = lg_clean_query($test);
        $result = lg_is_spam($test) ? 'spam' : 'ok';
        $test = htmlspecialchars($test);
        echo "<p><b>$test</b> --lg_is_spam--> $result --LGSS['spam_trigger']--> {$LGSS['spam_trigger']}</p>";
        // reset so the next sample does not report a stale trigger
        $LGSS['spam_trigger'] = '';
    }
    die(__FILE__);
}
/*____________________________________________________________________________*/
?>
/*** DOCUMENTATION LAYER
Last Google Query Validation Library
Last Update: Jul 2007
Author: Tom at klenwell@gmail.com
FUNCTIONS
lg_clean_query($query)
lg_is_spam($string)
lg_on_spam_list($query)
lg_spam_preg_match($query)
lg_spam_redirect()
NOTES
______________________________________________________________________________*/
// Shared filter state and configuration (LGSS: Last Google Site Security).
// Each key is assigned individually so entries already present in $LGSS are kept.

// Token or regex hit that flagged the most recent query; '' until a check fires.
$LGSS['spam_trigger'] = '';

// NOTE(review): presumably where flagged requests are sent -- the redirect code
// is not part of this file, confirm against the caller.
$LGSS['default_redirect'] = 'http://klenwell.net/is/SpamTrap';

// Terms consulted by lg_on_spam_list(): two drug-spam words and the forum-spam '[url' tag.
$LGSS['TERM'] = array('hydrocodone', 'phentermine', '[url');

// Pattern consulted by lg_spam_preg_match(): '%' delimiters, case-insensitive (i), ungreedy (U).
$LGSS['regex'] = '%' . implode('|', array(
    '(lo\d+)',          // e.g. lo1239 -- something to do with loopbacks?
    '(<\s*br[^\>]*>)',  // don't need line breaks
    '(viagra)',         // common spam terms
)) . '%iU';
// lg_clean_query
/*____________________________________________________________________________*/
/**
 * Normalise a raw query: keep only its first line, lower-case it and
 * collapse every run of whitespace into a single space.
 *
 * @param string $query raw query text
 * @return string cleaned query ('' when nothing is left)
 */
function lg_clean_query($query)
{
    $first_line = trim($query);

    // strpos() gives FALSE when there is no newline; FALSE > 0 is false, so the
    // text is then kept whole. After trim() a newline can never sit at offset 0.
    $break_at = strpos($first_line, "\n");
    if ( $break_at > 0 )
    {
        $first_line = substr($first_line, 0, $break_at);
    }

    // lower-case first, then squeeze whitespace runs (tabs, stray "\r", ...) to one space
    $squeezed = preg_replace('%\s+%', ' ', strtolower($first_line));

    return trim($squeezed);
}
/*____________________________________________________________________________*/
// lg_is_spam
/*____________________________________________________________________________*/
/**
 * Decide whether a submitted string looks like spam.
 *
 * Checks run in order and stop at the first hit:
 *   1. the bot-bait form field was filled in,
 *   2. lg_on_spam_list() flags a token,
 *   3. lg_spam_preg_match() flags the text,
 *   4. the text contains more than one 'http://'.
 *
 * @param string $string text to inspect (callers in this file pass it through lg_clean_query() first)
 * @return int 1 when any check fires, 0 otherwise (and always 0 for an empty string)
 */
function lg_is_spam($string)
{
    // sanity check -- note that empty() also treats the string '0' as empty
    if ( empty($string) ) return 0;

    // css-hidden text field (bot bait) source: http://it.slashdot.org/comments.pl?sid=247229&cid=19797293
    // note: not actual field name
    // !empty() instead of a bare read: the field is absent on most requests and
    // reading a missing index raises a notice/warning.
    if ( !empty($_POST['bot_trap']) ) return 1;

    // check spam list
    if ( lg_on_spam_list($string) ) return 1;

    // regex check
    if ( lg_spam_preg_match($string) ) return 1;

    // too many urls
    if ( substr_count($string, 'http://') > 1 ) return 1;

    // A newline in the input once signalled a bot (spoofed blogspot form); that
    // check stays disabled because lg_clean_query() already chops at the first newline.

    // clean
    return 0;
} // end Fx
/*____________________________________________________________________________*/
// is spam term
/*____________________________________________________________________________*/
/**
 * Check the whitespace-separated tokens of a query against $LGSS['TERM'].
 *
 * Matching rules:
 *   - comparison is case-insensitive;
 *   - a term that starts with a letter, digit or underscore must equal the whole token;
 *   - a term that starts with any other character (a markup fragment such as
 *     '[url') matches when the token begins with it, because such a fragment
 *     is never a token on its own ('[url=http://...]').
 *
 * On a hit the matching token is stored in $LGSS['spam_trigger'].
 *
 * @param string $query text to inspect
 * @return int 1 when a token matches a listed term, 0 otherwise
 */
function lg_on_spam_list($query)
{
    global $LGSS;

    // no usable term list -> nothing can match
    if ( empty($LGSS['TERM']) || !is_array($LGSS['TERM']) ) return 0;

    // split on any whitespace run (spaces, tabs, newlines) and drop empty tokens
    $tokens = preg_split('%\s+%', trim($query), -1, PREG_SPLIT_NO_EMPTY);
    if ( empty($tokens) ) return 0; // empty query, or preg_split() failed

    foreach ( $tokens as $token )
    {
        foreach ( $LGSS['TERM'] as $term )
        {
            $term = (string) $term;
            if ( $term === '' ) continue;

            if ( preg_match('%^\W%', $term) )
            {
                // markup fragment: prefix match
                $hit = ( strncasecmp($token, $term, strlen($term)) === 0 );
            }
            else
            {
                // ordinary word: whole-token match
                $hit = ( strcasecmp($token, $term) === 0 );
            }

            if ( $hit )
            {
                $LGSS['spam_trigger'] = $token;
                return 1;
            }
        }
    }

    return 0;
}
/*____________________________________________________________________________*/
// in spam regex
/*____________________________________________________________________________*/
/**
 * Run the configured spam pattern ($LGSS['regex']) against a query.
 *
 * On a hit the matched text, HTML-escaped, is stored in $LGSS['spam_trigger'].
 *
 * @param string $query text to inspect
 * @return int 1 when the pattern matches, 0 otherwise
 */
function lg_spam_preg_match($query)
{
    global $LGSS;

    $hits = array();

    // preg_match() yields 0 for "no match" and FALSE on error; both mean "not flagged"
    if ( !preg_match($LGSS['regex'], $query, $hits) )
    {
        return 0;
    }

    // element 0 holds the full text matched by the pattern
    $LGSS['spam_trigger'] = htmlspecialchars($hits[0]);

    return 1;
}
/*____________________________________________________________________________*/
// Testbed
/*____________________________________________________________________________*/
// Manual testbed: change the 0 to a true value to print, as HTML, what
// lg_clean_query() and lg_is_spam() make of the sample inputs below.
// The run ends with die(), so nothing after this block executes.
if ( 0 )
{
    // start from an empty list so a pre-existing $_TEST cannot leak in
    $_TEST = array();
    $_TEST[] = "test site <br />";
    $_TEST[] = "test site br />";
    $_TEST[] = "lo831l";
    $_TEST[] = <<<TEST
Very fabulos site!
idiot test
idiot test
[URL=http://idiotest.9999mb.com]idiot test[/URL]
idiot test http://idiotest.9999mb.com
Thank you!
TEST;
    $_TEST[] = ' PhenTermine ';
    $_TEST[] = 'this is a legit query';

    // test clean query
    foreach ( $_TEST as $test )
    {
        // escape the cleaned text as well as the input: lg_clean_query() leaves
        // markup such as <br /> intact, and it must be shown, not rendered
        $result = htmlspecialchars(lg_clean_query($test));
        $test = htmlspecialchars($test);
        echo "<p><b>$test</b> --lg_clean_query--> $result</p>";
    }

    echo "<h2>begin spam test</h2>";
    foreach ( $_TEST as $test )
    {
        $test = lg_clean_query($test);
        $result = lg_is_spam($test) ? 'spam' : 'ok';
        $test = htmlspecialchars($test);
        echo "<p><b>$test</b> --lg_is_spam--> $result --LGSS['spam_trigger']--> {$LGSS['spam_trigger']}</p>";
        // reset so the next sample does not report a stale trigger
        $LGSS['spam_trigger'] = '';
    }
    die(__FILE__);
}
/*____________________________________________________________________________*/
?>
CategoryLastGoogle
There are no comments on this page. [Add comment]