#!/usr/bin/php
<?
# @package      hubzero-metrics
# @file         xlogimport_webhits
# @author       Nicholas J. Kisseberth <nkissebe@purdue.edu>
# @author       Swaroop Shivarajapura <swaroop@purdue.edu>
# @copyright    Copyright (c) 2011-2013 HUBzero Foundation, LLC.
# @license      http://www.gnu.org/licenses/lgpl-3.0.html LGPLv3
#
# Copyright (c) 2011-2013 HUBzero Foundation, LLC.
#
# This file is part of: The HUBzero(R) Platform for Scientific Collaboration
#
# The HUBzero(R) Platform for Scientific Collaboration (HUBzero) is free
# software: you can redistribute it and/or modify it under the terms of
# the GNU Lesser General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# HUBzero is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# HUBzero is a registered trademark of HUBzero Foundation, LLC.
#
# =========================================================================
# This Script imports apache logs into the web and webhits tables
#
# USAGE: ./xlogimport_webhits <filename>
#
# =========================================================================

# Report everything except notices, and ask PHP to display errors.
error_reporting(E_ALL & ~E_NOTICE);
@ini_set('display_errors','1');

# Fallback when no __DIR__ constant is defined: use the part of this file's
# path before the last "/", with a trailing "/" appended.
if (!defined('__DIR__')) {
    define("__DIR__", substr(__FILE__, 0, strrpos(__FILE__, "/")) . "/");
}

# Pull in the shared include files, in the same order as always.
# NOTE(review): db_connect(), clean_exit(), gen_exclude_list() etc. are
# presumably defined in these files -- they are not visible from this script.
foreach (array('hub_parameters.php', 'db_connect.php', 'func_misc.php') as $include_file) {
    require_once(__DIR__."/../includes/".$include_file);
}

# Connect to the hub database (db_connect() comes from the include files).
$db_hub = db_connect('db_hub');

# The log file to import is the first command-line argument.  Without it there
# is nothing to open, and fopen() must not be handed an empty path, so stop
# with a usage message instead.
# NOTE(review): clean_exit() is defined outside this script and is expected to
# terminate it -- confirm.
if (!isset($_SERVER['argv'][1]) || $_SERVER['argv'][1] === '') {
	clean_exit("USAGE: ./xlogimport_webhits <filename>\n");
}

$filehandle = fopen($_SERVER['argv'][1], "r");

# fopen() returns false when the file cannot be opened.
if (!$filehandle) {
	$msg = "Error opening file: ".$_SERVER['argv'][1]."\n";
	clean_exit($msg);
}

# Collects one message per line that matches neither log pattern; the whole
# text is printed once the file has been processed.
$unrec = '';

# Exclusion lists.  A request is only counted when its IP, URL and user agent
# are not found in these lists (checked with search_array() in the main loop).
# NOTE(review): gen_exclude_list() is defined outside this script; where the
# lists come from is not visible here.

# building excluded IP list
$filtered_ips = gen_exclude_list('ip');
# building excluded URL list
$filtered_urls = gen_exclude_list('url');
# building excluded useragent list
$filtered_useragents = gen_exclude_list('useragent');

# Older log line layout, 14 capture groups, in order:
#   1 date (YYYY-MM-DD)   2 time          3 timezone     4 user
#   5 request line (quoted)               6 return code  7 bytes
#   8 ip                  9 referrer (quoted)            10 useragent (quoted)
#  11 sslport            12 ts           13 tms          14 st_cookie
$log_pattern_old = '/^(\d{4}-\d{2}-\d{2})\s+(\d+:\d{2}:\d{2})\s+([\w\-\d]+)\s+(\S+)\s+\"(.+)\"\s+([\-\d]+)\s+([\d]+)\s+([\w\-\.\d]+)\s+\"(.*)\"\s+\"(.*)\"\s+([\w\-]+)\s+([\w\-\d]+)\s+([\w\-\d]+)\s+(.*)\s*$/';

# Newer log line layout, 23 capture groups, in order:
#   1 date (YYYY-MM-DD)   2 time          3 timezone     4 pid
#   5 user                6 request line (quoted)        7 return code
#   8 bytes               9 ip           10 referrer (quoted)
#  11 useragent (quoted) 12 sslport      13 ts           14 tms
#  15 uidNumber          16 joomla_id    17 st_cookie    18 auth_type
#  19 comp_name          20 view_name    21 task_name    22 actn_name
#  23 item_name
# This pattern is tried first; the older one is the fallback.
$log_pattern_new = '/^(\d{4}-\d{2}-\d{2})\s+(\d+:\d{2}:\d{2})\s+([\w\-\d]+)\s+([\d]+)\s+(\S+)\s+\"(.+)\"\s+([\-\d]+)\s+([\d]+)\s+([\w\-\.\d]+)\s+\"(.*)\"\s+\"(.*)\"\s+([\w\-]+)\s+([\w\-\d]+)\s+([\w\-\d]+)\s+([\-\d]+)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s*$/';

# $debug is not referenced again anywhere in this script.
$debug     = 0;
# Date of the day currently being counted ('' until the first counted hit).
$prevdatestamp = '';
# Number of hits counted so far for $prevdatestamp.
$hits      = 0;

# Store the hit total of one day in the webhits table of the metrics database.
#
#   $db_hub     database handle, passed through to mysql_exec()
#   $datestamp  value written to the `datetime` column
#   $hits       value written to the `hits` column
#
# The database name is read from the global $metrics_db.  Both values go
# through dbquote() before being placed in the statement.  Nothing is returned.
function update_webhits($db_hub, $datestamp, $hits)
{
	global $metrics_db;

	$quoted_day   = dbquote($datestamp);
	$quoted_total = dbquote($hits);

	$target_table = $metrics_db.'.webhits';
	$insert_stmt  = 'INSERT INTO '.$target_table.' (datetime, hits) VALUES(' . $quoted_day . ', ' . $quoted_total . ')';

	mysql_exec($db_hub, $insert_stmt);
}
	
# Read the log line by line, parse each line with the newer or the older
# pattern, and count the qualifying hits per day.
#
# The loop tests the return value of fgets() rather than calling feof() after
# the read: fgets() still returns the final line when it reaches end-of-file,
# so a last line without a trailing newline is processed too, and a failed
# read ends the loop instead of repeating forever.
while (($line = fgets($filehandle)) !== false)
{
	if (preg_match($log_pattern_new, $line, $matches)) {

		# Newer layout (23 fields).  Only some of these values are used
		# further down in this script; the rest record the field order.
		$datestamp = $matches[1];
		$timestamp = $matches[2];
		$timezone  = $matches[3];
		$pid       = $matches[4];
		$user      = $matches[5];
		$firstline = $matches[6];
		$return    = $matches[7];
		$bytes     = $matches[8];
		$ip        = $matches[9];
		$referrer  = $matches[10];
		$useragent = $matches[11];
		$sslport   = $matches[12];
		$ts        = $matches[13];
		$tms       = $matches[14];
		$uidNumber = $matches[15];
		$joomla_id = $matches[16];
		$st_cookie = $matches[17];
		$auth_type = $matches[18];
		$comp_name = $matches[19];
		$view_name = $matches[20];
		$task_name = $matches[21];
		$actn_name = $matches[22];
		$item_name = $matches[23];

	} else if (preg_match($log_pattern_old, $line, $matches)) {

		# Older layout (14 fields); fields it does not carry are set to ''.
		$datestamp = $matches[1];
		$timestamp = $matches[2];
		$timezone  = $matches[3];
		$pid       = '';
		$user      = $matches[4];
		$firstline = $matches[5];
		$return    = $matches[6];
		$bytes     = $matches[7];
		$ip        = $matches[8];
		$referrer  = $matches[9];
		$useragent = $matches[10];
		$sslport   = $matches[11];
		$ts        = $matches[12];
		$tms       = $matches[13];
		$uidNumber = '';
		$joomla_id = '';
		$st_cookie = $matches[14];
		$auth_type = '';
		$comp_name = '';
		$view_name = '';
		$task_name = '';
		$actn_name = '';
		$item_name = '';

	} else {

		# Neither pattern matched: remember the line for the final report.
		$unrec .= 'Unrecognized log format: '.$line;
		continue;

	}

	# Split the request line into method, URL and protocol.  Padding with
	# null gives missing parts the same value the bare list() assignment
	# produced, without needing the @ error suppression.
	$request = preg_split("/[ ]+/", $firstline);
	list($method, $url, $protocol) = array_pad(is_array($request) ? $request : array(), 3, null);

	if (empty($url))
	{
		# Only one token: treat it as the URL of a plain GET request.
		$url = $method;
		$method = 'GET';
		$protocol = 'HTTP/1.1';
	}
	else if (empty($protocol))
		$protocol = 'HTTP/1.1';

	$url = preg_replace('/\/+/','/',$url); // collapse multiple / to single /

	# A hit is a GET or POST answered with status 200 and a non-empty body
	# whose IP, user agent and URL are not found in the exclusion lists.
	if ($return == 200 && $bytes > 0
		&& (!search_array($ip, $filtered_ips))
		&& (!search_array($useragent, $filtered_useragents))
		&& (!search_array($url, $filtered_urls))
		&& ($method == "GET" || $method == "POST"))
	{
		# First hit of a different day: store the total of the day that just
		# ended (if there was one) and start counting again from zero.
		if ($prevdatestamp != $datestamp)
		{
			if (!empty($prevdatestamp))
				update_webhits($db_hub, $prevdatestamp, $hits);

			$prevdatestamp = $datestamp;
			$hits = 0;
		}

		$hits++;
	}
}

# Insert total hit-count for final day into database...
# $prevdatestamp is still empty when no line in the file counted as a hit;
# in that case there is no day to record, so do not insert a row with an
# empty date and zero hits.
if (!empty($prevdatestamp))
	update_webhits($db_hub, $prevdatestamp, $hits);

# Report every line that matched neither log pattern.
if($unrec)
	print $unrec;

# Release the log file and the database connection.
fclose($filehandle);
db_close($db_hub);

?>
