#!/usr/bin/php
<?php
# @package      hubzero-metrics
# @file         xlogimport_apache
# @author       Nicholas J. Kisseberth <nkissebe@purdue.edu>
# @author       Swaroop Shivarajapura <swaroop@purdue.edu>
# @copyright    Copyright (c) 2011-2013 HUBzero Foundation, LLC.
# @license      http://www.gnu.org/licenses/lgpl-3.0.html LGPLv3
#
# Copyright (c) 2011-2013 HUBzero Foundation, LLC.
#
# This file is part of: The HUBzero(R) Platform for Scientific Collaboration
#
# The HUBzero(R) Platform for Scientific Collaboration (HUBzero) is free
# software: you can redistribute it and/or modify it under the terms of
# the GNU Lesser General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# HUBzero is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# HUBzero is a registered trademark of HUBzero Foundation, LLC.
#
# =========================================================================
# This script imports Apache access-log lines into the `web` table of the metrics database
#
# USAGE: ./xlogimport_apache <filename>
#

# Bootstrap: configure error reporting, load the shared includes, connect to
# the hub database and open the log file named on the command line.

# Report everything except notices and ask PHP to display errors.
error_reporting(E_ALL & ~E_NOTICE);
@ini_set('display_errors','1');

# Fallback definition of __DIR__ derived from __FILE__ (this variant carries a
# trailing slash).
if(!defined('__DIR__')) {
    $fPos = strrpos(__FILE__, "/");
    define("__DIR__", substr(__FILE__, 0, $fPos) . "/");
}

require_once(__DIR__."/../includes/hub_parameters.php");
require_once(__DIR__."/../includes/db_connect.php");
require_once(__DIR__."/../includes/func_misc.php");

$db_hub = db_connect('db_hub');

# The log file to import is the first command-line argument.
$logfile = isset($_SERVER['argv'][1]) ? $_SERVER['argv'][1] : '';

if ($logfile === '') {
	# Without a filename there is nothing to open; report usage instead of
	# letting fopen() fail on an empty path.
	clean_exit("USAGE: ./xlogimport_apache <filename>\n");
}

$filehandle = fopen($logfile, "r");

if (!$filehandle) {
	# The message ends with a real newline; the previous bare `n` was an
	# undefined constant rather than a line terminator.
	$msg = 'Error opening file: '.$logfile."\n";
	clean_exit($msg);
}

# Main import pass: read the log line by line, parse each line with one of the
# two known formats, filter out unwanted requests and append the remaining
# ones to a multi-row INSERT that is executed in batches.

# Collects the lines that match neither log format.
$unrec = '';

# building excluded IP list
$filtered_ips = gen_exclude_list('ip');
# building excluded URL list
$filtered_urls = gen_exclude_list('url');
# building excluded useragent list
$filtered_useragents = gen_exclude_list('useragent');

# Older log format: 14 captured fields, no apache pid and no uidNumber/session/component fields.
$log_pattern_old = '/^(\d{4}-\d{2}-\d{2})\s+(\d+:\d{2}:\d{2})\s+([\w\-\d]+)\s+(\S+)\s+\"(.+)\"\s+([\-\d]+)\s+([\d]+)\s+([\w\-\.\d]+)\s+\"(.*)\"\s+\"(.*)\"\s+([\w\-]+)\s+([\w\-\d]+)\s+([\w\-\d]+)\s+(.*)\s*$/';

# Newer log format: 23 captured fields, adding the pid after the timezone and nine trailing fields.
$log_pattern_new = '/^(\d{4}-\d{2}-\d{2})\s+(\d+:\d{2}:\d{2})\s+([\w\-\d]+)\s+([\d]+)\s+(\S+)\s+\"(.+)\"\s+([\-\d]+)\s+([\d]+)\s+([\w\-\.\d]+)\s+\"(.*)\"\s+\"(.*)\"\s+([\w\-]+)\s+([\w\-\d]+)\s+([\w\-\d]+)\s+([\-\d]+)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s+([^_].*)\s*$/';

# NOTE(review): $debug and $prevdatestamp are not read anywhere in this script;
# kept in case an included helper reads them as globals -- confirm before removing.
$debug     = 0;
$prevdatestamp = '';

# Statement header shared by every batch; defined once so the column list
# cannot drift between the first batch and later ones.
$sql_prefix = 'INSERT INTO '.$metrics_db.'.web (datetime, content, ip, uidNumber, apache_pid, referrer, useragent, joomla_sessionid, site_cookie, auth_type, component_name, view_name, task_name, action_name, item_name) VALUES ';
$sql_ins = $sql_prefix;
$cnt = 0;

# Loop on the result of fgets() itself: testing feof() after the read would
# discard a final line that is not terminated by a newline.
while (($line = fgets($filehandle)) !== false)
{
	if (preg_match($log_pattern_new, $line, $matches)) {

		$datestamp = $matches[1];
		$timestamp = $matches[2];
		$timezone  = $matches[3];
		$pid       = $matches[4];
		$user      = $matches[5];
		$firstline = $matches[6];
		$return    = $matches[7];
		$bytes     = $matches[8];
		$ip        = $matches[9];
		$referrer  = $matches[10];
		$useragent = $matches[11];
		$sslport   = $matches[12];
		$ts        = $matches[13];
		$tms       = $matches[14];
		$uidNumber = $matches[15];
		$joomla_id = $matches[16];
		$st_cookie = $matches[17];
		$auth_type = $matches[18];
		$comp_name = $matches[19];
		$view_name = $matches[20];
		$task_name = $matches[21];
		$actn_name = $matches[22];
		$item_name = $matches[23];

	} else if (preg_match($log_pattern_old, $line, $matches)) {

		// Fields the old format does not carry are stored as empty strings.
		$datestamp = $matches[1];
		$timestamp = $matches[2];
		$timezone  = $matches[3];
		$pid       = '';
		$user      = $matches[4];
		$firstline = $matches[5];
		$return    = $matches[6];
		$bytes     = $matches[7];
		$ip        = $matches[8];
		$referrer  = $matches[9];
		$useragent = $matches[10];
		$sslport   = $matches[11];
		$ts        = $matches[12];
		$tms       = $matches[13];
		$uidNumber = '';
		$joomla_id = '';
		$st_cookie = $matches[14];
		$auth_type = '';
		$comp_name = '';
		$view_name = '';
		$task_name = '';
		$actn_name = '';
		$item_name = '';

	} else {

		// Remember the line so it can be reported after the import.
		$unrec .= 'Unrecognized log format: '.$line;
		continue;

	}

	// A missing or "-" uidNumber is stored as 0.
	if ((empty($uidNumber)) || ($uidNumber == '-'))
		$uidNumber = 0;

	// Split the request line into "METHOD URL PROTOCOL"; pad to three
	// elements so short request lines yield nulls instead of notices.
	list($method, $url, $protocol) = array_pad(preg_split("/[ ]+/", $firstline), 3, null);

	if (empty($url))
	{
		// Only one token present: treat it as the URL of a GET request.
		$url = $method;
		$method = 'GET';
		$protocol = 'HTTP/1.1';
	}
	else if (empty($protocol))
		$protocol = 'HTTP/1.1';

	$url = preg_replace('/\/+/','/',$url); // collapse multiple / to single /

	$bot = 0;
	if ($useragent)
		$bot = checkbot($db_hub, $useragent);

	// Keep only status-200, non-empty GET/POST responses whose IP, user agent
	// and URL are not on an exclude list and whose user agent is not a bot.
	if (!($return == 200 && $bytes > 0 && (!search_array($ip, $filtered_ips)) && (!search_array($useragent, $filtered_useragents)) && (!search_array($url, $filtered_urls)) && ($method == "GET" || $method == "POST") && (!$bot)))
		continue;

	// Static assets, template, administrator, webdav and project svn URLs are
	// skipped, unless the URL is under /resources/.
	$is_excluded_path = preg_match('/\.(gif|jpeg|jpg|png|ps|ico|css|js)$/i', $url)
		|| preg_match('/^\/templates\//i', $url)
		|| preg_match('/^\/administrator\//i', $url)
		|| preg_match('/^\/webdav\//i', $url)
		|| preg_match('/\/projects\/.+?\/svn\/\!svn\//i', $url);

	if ($is_excluded_path && !preg_match('/^\/resources\//i', $url))
		continue;

	$dt_time = $datestamp." ".$timestamp;
	$sql_ins .= ' ('.
		dbquote($dt_time)   . ', ' .
		dbquote($url)       . ', ' .
		dbquote($ip)        . ', ' .
		dbquote($uidNumber) . ', ' .
		dbquote($pid)       . ', ' .
		dbquote($referrer)  . ', ' .
		dbquote($useragent) . ', ' .
		dbquote($joomla_id) . ', ' .
		dbquote($st_cookie) . ', ' .
		dbquote($auth_type) . ', ' .
		dbquote($comp_name) . ', ' .
		dbquote($view_name) . ', ' .
		dbquote($task_name) . ', ' .
		dbquote($actn_name) . ', ' .
		dbquote($item_name) . '), ';

	$cnt++;
	if ($cnt > 1000) {
		// Batch is full: strip the trailing ", ", execute it and start a new one.
		$cnt = 0;
		$sql_ins = rtrim($sql_ins, ', ');
		mysql_exec($db_hub, $sql_ins); // Insert new record in database...
		$sql_ins = $sql_prefix;
	}
}
// Final step: execute whatever rows are still queued, report the lines that
// could not be parsed, then release the file and database handles.
if ($cnt > 0) {
	// Drop the trailing ", " left after the last queued row before executing.
	$final_batch = rtrim($sql_ins, ', ');
	mysql_exec($db_hub, $final_batch);
}

if (!empty($unrec)) {
	echo $unrec;
}

fclose($filehandle);
db_close($db_hub);

?>
