#!/usr/bin/php
<?php
# @package      hubzero-metrics
# @file         xlogimport_identify_bots
# @author       Swaroop Shivarajapura <swaroop@purdue.edu>
# @copyright    Copyright (c) 2011-2013 HUBzero Foundation, LLC.
# @license      http://www.gnu.org/licenses/lgpl-3.0.html LGPLv3
#
# Copyright (c) 2011-2013 HUBzero Foundation, LLC.
#
# This file is part of: The HUBzero(R) Platform for Scientific Collaboration
#
# The HUBzero(R) Platform for Scientific Collaboration (HUBzero) is free
# software: you can redistribute it and/or modify it under the terms of
# the GNU Lesser General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# HUBzero is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# HUBzero is a registered trademark of HUBzero Foundation, LLC.
#
# =========================================================================
# This script reads an Apache log file and records every user agent that
# contains a known bot keyword in the bot_useragents table.
#
# USAGE: ./xlogimport_identify_bots <filename>
#

# Report every error class except notices, and ask PHP to display them.
error_reporting(E_ALL & ~E_NOTICE);
@ini_set('display_errors', '1');

# When defined() does not report a __DIR__ constant, define one from the
# directory part of this file's path (kept with a trailing slash).
if (defined('__DIR__') === false) {
    $lastSlash = strrpos(__FILE__, '/');
    $scriptDir = substr(__FILE__, 0, $lastSlash);
    define('__DIR__', $scriptDir . '/');
}

require_once(__DIR__."/../includes/hub_parameters.php");
require_once(__DIR__."/../includes/db_connect.php");
require_once(__DIR__."/../includes/func_misc.php");

# Connect to the hub database; 'db_hub' is the connection name handed to
# db_connect() (defined in includes/db_connect.php).
$db_hub = db_connect('db_hub');

# The log file to scan is the first command-line argument.
$logfile = isset($_SERVER['argv'][1]) ? $_SERVER['argv'][1] : '';

# Only attempt the open when a file name was actually supplied, so that a
# missing argument goes through the normal error path below instead of
# handing an empty path to fopen().
$filehandle = ($logfile !== '') ? fopen($logfile, "r") : false;

if (!$filehandle) {
    # Terminate the message with a real newline (the original appended the
    # bare word n, which is an undefined constant).
    $msg = 'Error opening file: '.$logfile."\n";
    clean_exit($msg);
}

# Collects one message per log line that matches neither pattern below.
$unrec = '';

# Keywords that mark a user agent as a bot; they are compared against each
# user agent with a case-insensitive substring search.
$filters = array(
    "feedfetcher",
    "msnbot",
    "gsa-crawler",
    "googlebot",
    "yandex",
    "spider",
    "bot",
    "search",
    "crawl",
    "archive",
    "harvest",
    "slurp",
    "feed",
    "nutch",
    "robot",
    "fetch",
    "findlinks"
);

# Older log layout: 14 capture groups, with the user agent in group 10.
$log_pattern_old = '/^(\d{4}-\d{2}-\d{2})\s+(\d+:\d{2}:\d{2})\s+([\w\-\d]+)\s+(\S+)\s+\"(.+)\"\s+([\-\d]+)\s+([\d]+)\s+([\w\-\.\d]+)\s+\"(.*)\"\s+\"(.*)\"\s+([\w\-]+)\s+([\w\-\d]+)\s+([\w\-\d]+)\s+(.*)\s*$/';

# Newer log layout: 23 capture groups. Compared with the older layout it has
# an extra numeric field after the third group (which moves the user agent to
# group 11) and ends with a numeric field followed by eight fields that must
# not start with an underscore.
$log_pattern_new = '/^(\d{4}-\d{2}-\d{2})\s+(\d+:\d{2}:\d{2})\s+([\w\-\d]+)\s+([\d]+)\s+(\S+)\s+\"(.+)\"\s+([\-\d]+)\s+([\d]+)\s+([\w\-\.\d]+)\s+\"(.*)\"\s+\"(.*)\"\s+([\w\-]+)\s+([\w\-\d]+)\s+([\w\-\d]+)\s+([\-\d]+)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s*$/';

# Collect the distinct user-agent strings found in the log file.
#
# Every line is tried against the newer log layout first, then the older one.
# Capture-group layout of the two patterns:
#   new: 1 datestamp, 2 timestamp, 3 timezone, 4 pid, 5 user, 6 firstline,
#        7 return, 8 bytes, 9 ip, 10 referrer, 11 useragent, 12 sslport,
#        13 ts, 14 tms, 15 uidNumber, 16 joomla_id, 17 st_cookie,
#        18 auth_type, 19 comp_name, 20 view_name, 21 task_name,
#        22 actn_name, 23 item_name
#   old: 1 datestamp, 2 timestamp, 3 timezone, 4 user, 5 firstline,
#        6 return, 7 bytes, 8 ip, 9 referrer, 10 useragent, 11 sslport,
#        12 ts, 13 tms, 14 st_cookie
# Only the user agent is used by this script, so it is the only field pulled
# out of the match.
$useragent_strings = array();

# User agents already collected, stored as array keys for constant-time
# duplicate checks.
$seen_agents = array();

# Loop on the return value of fgets() rather than testing feof() after the
# read: feof() is already true after reading a final line that has no
# trailing newline, which made the original loop discard that line. This
# form also ends the loop if fgets() fails for any other reason.
while (($line = fgets($filehandle)) !== false)
{
	if (preg_match($log_pattern_new, $line, $matches)) {
		$useragent = $matches[11];
	} else if (preg_match($log_pattern_old, $line, $matches)) {
		$useragent = $matches[10];
	} else {
		# Remember the line so it can be reported once processing is done.
		$unrec .= 'Unrecognized log format: '.$line;
		continue;
	}

	# Skip empty user agents and ones that were already collected.
	if ($useragent && !isset($seen_agents[$useragent])) {
		$seen_agents[$useragent] = true;
		$useragent_strings[] = $useragent;
	}
}


# Store every distinct user agent that contains at least one bot keyword.
$useragent_strings = array_unique($useragent_strings);
foreach ($useragent_strings as $agent) {
    foreach ($filters as $filter) {
        # Case-insensitive substring match of the keyword within the agent.
        if (stripos($agent, $filter) !== false) {
            # NOTE(review): dbquote() is defined in the included files and is
            # presumably responsible for escaping and quoting the value; confirm.
            $sql_ins = 'INSERT IGNORE INTO '.$metrics_db.'.bot_useragents (useragent) VALUES ('.dbquote($agent).')';
            mysql_exec($db_hub, $sql_ins);
            # One matching keyword is enough. Without this, an agent matching
            # several keywords (e.g. "googlebot" and "bot") issued the same
            # INSERT once per keyword.
            break;
        }
    }
}

# Delete rows whose user agent contains "searchtool" or " feed/".
# NOTE(review): presumably these are false positives of the "search" and
# "feed" keywords; confirm.
$sql = 'DELETE FROM '.$metrics_db.'.bot_useragents'
     . ' WHERE (useragent LIKE "%searchtool%" OR useragent LIKE "% feed/%")';
mysql_exec($db_hub, $sql);

# Print the accumulated messages about lines that matched no log pattern.
if ($unrec) {
	echo $unrec;
}

# Release the log file handle and the database connection.
fclose($filehandle);
db_close($db_hub);

?>
