Last Google Spam Filter

Like a twelve-year-old girl on MySpace, Last Google has been a target of the unwanted attentions of vile, disgusting predators almost since its inception in 2004. Necessary, grudging steps have been taken to control the scourge, but the struggle has evolved into the inevitable arms race between the diseased corps of the Windows-fueled botnet and one lethargic developer pursuing a hobby of sorts in his free time.

This page documents some of the steps I've taken to keep the scourge in check. My efforts on this subject are also chronicled from time to time in the lastgeist blog.

If you have any comments, inquiries, or suggestions, feel free to contact me at my email address.

Spam Filter Script

This is the latest PHP script I am using to catch form spam:
<?php

/***  DOCUMENTATION LAYER

Last Google Query Validation Library

Last Update: Jul 2007
Author: Tom at klenwell@gmail.com

FUNCTIONS
    lg_clean_query($query)
    lg_is_spam($string)
    lg_on_spam_list($query)
    lg_spam_preg_match($query)
    lg_spam_redirect()

NOTES

______________________________________________________________________________*/


// GLOBALS (LGSS: Last Google Site Security)
// Shared configuration/state array read by the lg_* functions via `global $LGSS`.

// Text that tripped the filter; written by lg_on_spam_list() and
// lg_spam_preg_match() when they flag a query.
$LGSS['spam_trigger'] = '';

// Where flagged requests are sent.
// NOTE(review): presumably consumed by lg_spam_redirect(), which is not shown here.
$LGSS['default_redirect'] = 'http://klenwell.net/is/SpamTrap';

// Whole-token blacklist: lg_on_spam_list() flags a query only when one of its
// whitespace-separated tokens equals one of these entries exactly.
$LGSS['TERM'] = array
    (
        'hydrocodone', 'phentermine',       // spam terms
        '[url'      // forum spam (bare token only -- "[url=..." is caught by the regex below)
    );

// Substring blacklist: matched anywhere in the query, case-insensitively (i),
// with quantifiers made lazy by the U modifier.
// NOTE(review): `lo\d+` has no word boundary, so it also matches inside
// longer words that end in "lo" + digit -- confirm that is intended.
$LGSS['regex'] = '%'        // begin regex
    . '(lo\d+)'         // e.g. lo1239 -- something to do with loopbacks?
    . '|(<\s*br[^\>]*>)'        // don't need line breaks
    . '|(\[url)'            // BBCode links, e.g. [url=http://...] (exact-token list cannot catch these)
    . '|(viagra)'           // common spam terms
    . '%iU';            // end regex

// lg_clean_query
/*____________________________________________________________________________*/
function lg_clean_query($query)
{
    // Normalizes a raw query for matching: keeps only the first line,
    // lower-cases it, and collapses every whitespace run to a single space.
    //
    // @param  string $query  raw user-submitted query
    // @return string         cleaned, single-line, lower-case query

    // After trim() the string cannot start with a newline, so the first
    // piece of the split is exactly "everything before the first newline"
    // (or the whole string when there is none).
    $pieces = explode("\n", trim($query), 2);
    $first_line = $pieces[0];

    // Lower-case, then squeeze whitespace runs (including a stray "\r"
    // left over from a CRLF line ending) down to one space.
    $collapsed = preg_replace('%\s+%', ' ', strtolower($first_line));

    // A squeezed trailing run leaves one space behind; strip it.
    return trim($collapsed);
}
/*____________________________________________________________________________*/



// lg_is_spam
/*____________________________________________________________________________*/
function lg_is_spam($string)
{
    // Decides whether a submitted query looks like form spam.
    // Checks run in order and the first hit wins:
    //   1. honeypot form field filled in
    //   2. blacklisted term        (lg_on_spam_list)
    //   3. blacklisted pattern     (lg_spam_preg_match)
    //   4. more than one URL in the text
    //
    // @param  string $string  query text to classify
    // @return int             1 if spam, 0 otherwise

    // sanity check: nothing submitted, nothing to classify
    if ( empty($string) ) return 0;

    // css-hidden text field (bot bait) source: http://it.slashdot.org/comments.pl?sid=247229&cid=19797293
    // note: not actual field name
    // !empty() keeps the same truthiness test but does not raise an
    // undefined-index notice when the field is absent from the request.
    if ( !empty($_POST['bot_trap']) ) return 1;

    // check spam list
    if ( lg_on_spam_list($string) ) return 1;

    // regex check
    if ( lg_spam_preg_match($string) ) return 1;

    // too many urls: count both http and https schemes, in any letter case
    $url_hits = array();
    if ( preg_match_all('%https?://%i', $string, $url_hits) > 1 ) return 1;

    // Embedded newlines (seen from bots posting via a spoofed blogspot form)
    // are deliberately not treated as spam here: lg_clean_query() already
    // truncates the query at the first newline.

    // clean
    return 0;

} // end Fx
/*____________________________________________________________________________*/


// is spam term
/*____________________________________________________________________________*/
function lg_on_spam_list($query)
{
    // Checks whether any whitespace-separated token of the query is on the
    // blacklist in $LGSS['TERM']. Comparison is case-insensitive and exact
    // (whole token, not substring). On a hit, the matching token is recorded
    // in $LGSS['spam_trigger'].
    //
    // @param  string $query  query text to scan
    // @return int            1 if a blacklisted term is present, 0 otherwise

    // globals
    global $LGSS;

    // No usable term list configured: nothing can match.
    if ( !isset($LGSS['TERM']) || !is_array($LGSS['TERM']) ) return 0;

    // Lower-case both sides so the check does not depend on the caller
    // having normalized the query first.
    $terms = array_map('strtolower', $LGSS['TERM']);

    // Split on any whitespace run (spaces, tabs, newlines) and drop empty
    // pieces, so "a\tb" and "a  b" both yield the tokens "a" and "b".
    $tokens = preg_split('%\s+%', strtolower(trim($query)), -1, PREG_SPLIT_NO_EMPTY);
    if ( !is_array($tokens) ) return 0;

    // cycle
    foreach ( $tokens as $token )
    {
        // strict: compare as strings only, no type juggling
        if ( in_array($token, $terms, true) )
        {
            $LGSS['spam_trigger'] = $token;
            return 1;
        }
    }

    return 0;
}
/*____________________________________________________________________________*/


// in spam regex
/*____________________________________________________________________________*/
function lg_spam_preg_match($query)
{
    // Tests the query against the pattern in $LGSS['regex'].
    // On a match, the HTML-escaped matched text is stored in
    // $LGSS['spam_trigger'].
    //
    // @param  string $query  query text to scan
    // @return int            1 if the pattern matches, 0 otherwise

    global $LGSS;

    $hits = array();

    // preg_match() yields 0 (no match) or false (error); both mean "not spam".
    if ( ! preg_match($LGSS['regex'], $query, $hits) )
    {
        return 0;
    }

    // Escaped because the trigger may contain markup such as "<br />".
    $LGSS['spam_trigger'] = htmlspecialchars($hits[0]);

    return 1;
}
/*____________________________________________________________________________*/


// Testbed
/*____________________________________________________________________________*/

// Manual self-test, disabled by default. Change the 0 to 1 to run it:
// it prints each sample through lg_clean_query() and lg_is_spam() as HTML
// paragraphs, then halts the script with die().
if ( 0 )
{
    // Sample inputs: a line-break tag, a broken tag, an "lo<digits>" token,
    // a multi-line forum-spam post, a mixed-case listed term, and a clean query.
    $_TEST[] = "test site <br />";
    $_TEST[] = "test site br />";
    $_TEST[] = "lo831l";
    $_TEST[] = <<<TEST
Very fabulos site!
idiot test
idiot test
[URL=http://idiotest.9999mb.com]idiot test[/URL]
idiot test http://idiotest.9999mb.com
Thank you!
TEST;
    $_TEST[] = ' PhenTermine ';
    $_TEST[] = 'this is a legit query';
   
    // test clean query
    // (the raw input is escaped for display; the cleaned result is echoed as-is)
    foreach ( $_TEST as $test )
    {
        $result = lg_clean_query($test);
        $test = htmlspecialchars($test);
        echo "<p><b>$test</b> --lg_clean_query--> $result</p>";
    }
   
    echo "<h2>begin spam test</h2>";

    // Each sample is cleaned first, so only its first line reaches lg_is_spam().
    foreach ( $_TEST as $test )
    {
        $test = lg_clean_query($test);
        $result = lg_is_spam($test) ? 'spam' : 'ok';
        $test = htmlspecialchars($test);
        echo "<p><b>$test</b> --lg_is_spam--> $result --LGSS['spam_trigger']--> {$LGSS['spam_trigger']}</p>";
        // reset so the next sample does not show a stale trigger
        $LGSS['spam_trigger'] = '';
    }
    // Print this file's path and stop, so nothing after the testbed runs.
    die(__FILE__);
}

/*____________________________________________________________________________*/

?>



CategoryLastGoogle

There are no comments on this page. [Add comment]

Valid XHTML 1.0 Transitional :: Valid CSS :: Powered by WikkaWiki