<?php
/*
 * search.php :: A Web archive search for ml-web-archiver.
 *
 * Version 1.1.0  November 21, 2025
 * Copyright (c) 2025, Ron Guerin <ron@vnetworx.net>
 *
 * Requires: PHP_PCRE, PHP 7.3+
 *
 * ml-web-archiver is Free Software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * ml-web-archiver is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * If you are not able to view the file COPYING, please write to the
 * Free Software Foundation, Inc.,
 * 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 * to get a copy of the GNU General Public License or to report a
 * possible license violation.
 *
 * @package ml-web-archiver
 * @author Ron Guerin <ron@vnetworx.net>
 * @license http://www.fsf.org/licenses/gpl.html GNU Public License
 * @copyright Copyright &copy; 2025 Ron Guerin
 * @filesource
 * @link http://gothamcode.com/ml-web-archiver
 * @version 1.1.0
 *
*/

// Report every class of PHP error and print it into the response.
// NOTE(review): display_errors is switched on unconditionally - confirm this is wanted on public deployments.
error_reporting(E_ALL);
ini_set('display_errors', 1);
define('VERSION', '1.1.0');

// Load the site configuration when it is present and readable
if (is_readable('/etc/ml-web-archiver/ml-web-archiver.conf.php')) {
	require_once '/etc/ml-web-archiver/ml-web-archiver.conf.php';
}

// SYSLOG is TRUE only when $syslog exists at this point and is exactly TRUE
// (presumably set by the configuration file above)
define('SYSLOG', isset($syslog) && $syslog === TRUE);

header('Content-Type: text/html; charset=utf-8');

// Get request parameters. The search form submits them via POST, while the
// pagination links emitted with the results carry the same names in the query
// string, so each value is taken from POST first and from GET otherwise.
$searchterm = $_POST['s'] ?? $_GET['s'] ?? '';
$searchterm = is_string($searchterm) ? trim($searchterm) : '';	// array-valued input is ignored
$listname = $_POST['list'] ?? $_GET['list'] ?? '';
$listname = is_string($listname) ? trim($listname) : '';
$searchfields = $_POST['q'] ?? $_GET['q'] ?? array();
// Keep only recognised field names. A non-array value or unknown names leave an
// empty array, which the "default to all fields" step further down then fills in.
$searchfields = is_array($searchfields)
	? array_values(array_intersect(array('subject', 'author', 'body'), array_filter($searchfields, 'is_string')))
	: array();
// The page number only ever travels in the query string
$page = isset($_GET['page']) ? max(1, intval($_GET['page'])) : 1;

// Find the list's config entry.
// $description, $archivepath and $archiveurl stay empty when no entry matches $listname.
$description = $archivepath = $archiveurl = '';
$listsfile = '/etc/ml-web-archiver/lists';
// parse_ini_file() returns FALSE when the file cannot be parsed; handle that like a missing file
$lists = file_exists($listsfile) ? parse_ini_file($listsfile, TRUE) : FALSE;
if ($lists === FALSE) {
	// Show an error page like the other failure paths instead of a blank response
	echo '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">'."\n";
	echo '<html><head><title>Search Error</title></head><body bgcolor="#ffffff">';
	echo '<h1>Error</h1><p>No lists defined.</p></body></html>';
	log_err('Error: ml-web-archiver search - No lists defined.', LOG_ERR);
	exit(1);
}
foreach ($lists as $address => $listdata) {
	if ((! is_array($listdata)) || (! isset($listdata['description'])) || (! isset($listdata['input']))
		|| (! isset($listdata['output']))) {
		log_err('Warning: Ignoring malformed list entry, missing '
			.'\'description\', \'input\' or \'output\' in '.$listsfile, LOG_ERR);
		continue;
	}
	// PHP stores integer-like array keys as integers, so compare as strings and strictly
	if ($listname !== (string) $address) continue;
	$description = trim($listdata['description']);
	$archivepath = trim($listdata['output']);
	// 'archiveurl' is not covered by the malformed-entry check above, so it may be absent
	$archiveurl = isset($listdata['archiveurl']) ? trim($listdata['archiveurl']) : '';
	break;
}

// Validate inputs - a list name and a resolved archive path are mandatory; the search term may be empty
if (empty($listname) || empty($archivepath)) {
	echo '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">'."\n"
		.'<html><head><title>Search Error</title></head><body bgcolor="#ffffff">'
		.'<h1>Error</h1><p>Missing list name or archive path.</p></body></html>';
	log_err('Error: ml-web-archiver search Called with missing list name or archive path.');
	exit(1);
}

// The archive path taken from the list entry must be an existing directory
if (is_dir($archivepath) === FALSE) {
	echo '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">'."\n"
		.'<html><head><title>Search Error</title></head><body bgcolor="#ffffff">'
		.'<h1>Error</h1><p>Invalid archive path.</p></body></html>';
	log_err('Error: ml-web-archiver search - Invalid archive path. "'.$archivepath.'"');
	exit(1);
}

// No field selected means every field is searched
$searchfields = empty($searchfields) ? array('subject', 'author', 'body') : $searchfields;

// Search through HTML files.
// Each entry appended to $results holds: subject, author, date (text taken from
// the message page), stamp (strtotime() of that date, 0 when it cannot be parsed),
// month (sub-directory name) and file (message file name).
$results = array();

if (! empty($searchterm)) {
	// The field selection is the same for every message, so evaluate it once.
	// A non-array selection searches nothing instead of raising an error in in_array().
	$fields = is_array($searchfields) ? $searchfields : array();
	$insubject = in_array('subject', $fields, TRUE);
	$inauthor = in_array('author', $fields, TRUE);
	$inbody = in_array('body', $fields, TRUE);

	// scandir() returns FALSE on failure; treat that as an empty directory
	$months = scandir($archivepath);
	if ($months === FALSE) $months = array();

	foreach ($months as $month) {
		$monthpath = $archivepath.'/'.$month;
		if (($month === '.') || ($month === '..') || (! is_dir($monthpath))) continue;

		$files = scandir($monthpath);
		if ($files === FALSE) continue;	// unreadable month directory, skip it

		foreach ($files as $file) {
			// Only message pages named with six digits, e.g. 000123.html
			if (! preg_match(chr(7).'^(\d{6})\.html$'.chr(7), $file)) continue;

			$filepath = $monthpath.'/'.$file;
			$content = file_get_contents($filepath);
			if ($content === FALSE) continue;

			$subject = $author = $body = $date = '';

			// Get Subject: string, except for a [prefix] if the list uses a prefix
			if (preg_match(chr(7).'<title>(?:\s*\[.*?\]\s*)?(.+?)<\/title>'.chr(7).'is', $content, $m)) {
				$subject = html_entity_decode(strip_tags($m[1]), ENT_QUOTES | ENT_HTML5, 'UTF-8');
			}

			// Get message author: the bold text that directly precedes the mailto: link
			if (preg_match(chr(7).'<b>(.*?)<\/b>\s*<a href="mailto:([^"]+)"'.chr(7).'is', $content, $m)) {
				$author = html_entity_decode(strip_tags($m[1]), ENT_QUOTES | ENT_HTML5, 'UTF-8');
			}

			// Get message body; it is only ever used for matching, so skip the work when bodies are not searched
			if ($inbody && preg_match(chr(7).'<!--beginarticle-->\s*<pre>(.*?)<\/pre>\s*<!--endarticle-->'.chr(7).'is', $content, $m)) {
				$body = html_entity_decode(strip_tags($m[1]), ENT_QUOTES | ENT_HTML5, 'UTF-8');
			}

			// Check if search term matches any selected field (case-insensitive substring)
			$match = ($insubject && stripos($subject, $searchterm) !== FALSE)
				|| ($inauthor && stripos($author, $searchterm) !== FALSE)
				|| ($inbody && stripos($body, $searchterm) !== FALSE);
			if (! $match) continue;

			// Get message date (only needed for messages that matched)
			if (preg_match(chr(7).'<i>([A-Z][a-z]{2}\s+[A-Z][a-z]{2}\s+\d+\s+\d+:\d+:\d+.*?)<\/i>'
				.chr(7).'s', $content, $m)) {
				$date = trim(strip_tags($m[1]));
			}
			$stamp = strtotime($date);
			$results[] = array('subject' => $subject, 'author' => $author, 'date' => $date,
				'stamp' => ($stamp) ? $stamp : 0, 'month' => $month, 'file' => $file);
		}
	}
}

// Order the hits by their parsed timestamp, most recent first
usort($results, function($left, $right) {
	return $right['stamp'] <=> $left['stamp'];
});

// Pagination: 30 hits per page, with the requested page clamped to the last available one
$perpage = 30;
$totalresults = count($results);
$totalpages = ceil($totalresults / $perpage);
if ($totalpages <= 1) $totalpages = 1;	// there is always at least one page, even with no hits
if ($page > $totalpages) $page = $totalpages;
$offset = $perpage * ($page - 1);
$pageresults = array_slice($results, $offset, $perpage);

// Output results
echo '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">'."\n";
echo '<html>'."\n";
echo '<head>'."\n";
echo '  <meta http-equiv="Content-Type" content="text/html; charset=utf-8">'."\n";
echo '  <title>'.($searchterm ? '\''.h($searchterm).'\' in ' : '').h($listname).'</title>'."\n";
echo '</head>'."\n";
echo '<body bgcolor="#ffffff" text="#000000" link="#0000C0" vlink="#800080">'."\n";

// Search form
// Gives the ' checked' attribute for a field checkbox when that field is selected;
// a non-array selection simply leaves every box unchecked.
$checked = function($field) use ($searchfields) {
	return (is_array($searchfields) && in_array($field, $searchfields)) ? ' checked' : '';
};
echo '<form action="'.h($_SERVER['REQUEST_URI']).'" method="post">'."\n";
echo '<input type="hidden" name="list" value="'.h($listname).'">'."\n";
echo '<table width="100%" border="0" cellpadding="0" cellspacing="0">'."\n";
echo '<tr><td align="left">Search:&nbsp;<input type="text" name="s" value="'
	.h($searchterm).'" size="25" maxlength="100">';
echo '<input type="submit" value="Search"><input type="reset" value="Reset"><br>'."\n";
echo '[<input type="checkbox" name="q[]" value="subject"'.$checked('subject').'>]&nbsp;Subjects&nbsp;';
echo '[<input type="checkbox" name="q[]" value="author"'.$checked('author').'>]&nbsp;Authors&nbsp;';
echo '[<input type="checkbox" name="q[]" value="body"'.$checked('body').'>]&nbsp;Bodies&nbsp;';
echo 'for&nbsp;list&nbsp;\''.h($listname).'\''."\n";
echo '</td></tr></table>'."\n";
echo '</form>'."\n";
echo '<hr>'."\n";

// Results
if (! empty($searchterm)) {
	echo '<h2>Search Results</h2>'."\n";
	echo '<p>Found '.$totalresults.' result'.($totalresults != 1 ? 's' : '').' for \''.h($searchterm).'\'</p>'."\n";

	if ($totalresults > 0) {
		// Query string for results page $p. Only the request parameters are included;
		// the server-side archive path is never read back from the request, so it is
		// kept out of the URLs rather than disclosed to the client.
		$buildurl = function($p) use ($searchterm, $listname, $searchfields) {
			return '?s='.urlencode($searchterm).'&list='.urlencode($listname)
				.'&'.http_build_query(array('q' => $searchfields)).'&page='.$p;
		};

		// The same navigation links are printed above and below the result list
		$nav = '';
		if ($page > 1) $nav .= ' <a href="'.h($buildurl($page - 1)).'">Previous</a> ';
		if ($page < $totalpages) $nav .= ' <a href="'.h($buildurl($page + 1)).'">Next</a> ';
		if ($totalpages > 1) $nav .= ' <a href="'.h($buildurl($totalpages)).'">Last</a> ';

		echo '<pre>';
		echo $nav;
		echo '  Page '.$page.' of '.$totalpages."\n";

		// Drop trailing slashes so joining with '/' cannot produce '//'
		$baseurl = rtrim($archiveurl, '/');
		$counter = $offset + 1;
		foreach ($pageresults as $result) {
			// Build the raw URL first and HTML-escape it exactly once when printed
			$link = $baseurl.'/'.rawurlencode($result['month']).'/'.rawurlencode($result['file']);
			echo '  '.sprintf('%3d', $counter).'. '.h(substr($result['date'], 0, 10)).'  ';
			echo '<a href="'.h($link).'">'.h($result['subject']).'</a>';
			// Pad so the author column starts 60 bytes after the subject start where possible
			echo str_repeat(' ', max(1, 60 - strlen($result['subject'])));
			echo '<i>'.h($result['author']).'</i>'."\n";
			$counter++;
		}

		echo $nav;
		echo '</pre>'."\n";
	}
}
echo '<p><a href="'.h($archiveurl).'">Return to '.h($description).' archive</a></p>'."\n";
echo '</body>'."\n";
echo '</html>'."\n";
exit;


####################################################################################################################################
####################################################################################################################################


/**
 * Escape text for safe inclusion in HTML content or a quoted attribute value.
 *
 * Both single and double quotes are converted. Byte sequences that are not
 * valid UTF-8 are replaced with U+FFFD instead of making the whole result an
 * empty string, so one bad byte does not blank out the text.
 *
 * @param mixed $text Value to escape; it is cast to string first.
 * @return string The escaped text.
 */
function h($text) {
	return htmlspecialchars((string) $text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
}

/**
 * Log a message to syslog (when the SYSLOG constant is TRUE) and to php://stderr.
 *
 * Writing to stderr is best effort: if the stream cannot be opened the
 * message is silently skipped there.
 *
 * @param string $msg  Message text.
 * @param int    $prio syslog priority, LOG_INFO by default.
 * @return void
 */
function log_err($msg, $prio=LOG_INFO) {
	if (SYSLOG) syslog($prio, $msg);
	$stderr = @fopen('php://stderr', 'w');
	// fopen() yields FALSE on failure; passing that to fwrite() raises a TypeError on PHP 8 that '@' cannot suppress
	if ($stderr === FALSE) return;
	// End each message with exactly one line break so consecutive messages do not run together
	@fwrite($stderr, rtrim($msg, "\r\n").PHP_EOL);
	@fclose($stderr);
}
