293 lines
11 KiB
PHP
293 lines
11 KiB
PHP
<?php
|
|
|
|
/**
|
|
* @file classes/tasks/PKPUsageStatsLoader.php
|
|
*
|
|
* Copyright (c) 2022 Simon Fraser University
|
|
* Copyright (c) 2022 John Willinsky
|
|
* Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
|
|
*
|
|
* @class PKPUsageStatsLoader
|
|
*
|
|
* @ingroup tasks
|
|
*
|
|
* @brief Scheduled task to extract, transform and load usage statistics data into the database.
|
|
*/
|
|
|
|
namespace PKP\task;
|
|
|
|
use APP\core\Application;
|
|
use APP\core\Services;
|
|
use APP\statistics\StatisticsHelper;
|
|
use Illuminate\Support\Facades\Bus;
|
|
use PKP\file\FileManager;
|
|
use PKP\jobs\statistics\CompileMonthlyMetrics;
|
|
use PKP\scheduledTask\ScheduledTaskHelper;
|
|
use PKP\site\Site;
|
|
use Throwable;
|
|
|
|
abstract class PKPUsageStatsLoader extends FileLoader
|
|
{
|
|
/**
|
|
* Whether the log files should be automatically moved to the stage folder.
|
|
* This is the case for daily log file processing.
|
|
* This is not the case if the whole month is reprocessed - all log files for the given month should be manually placed in the stage folder.
|
|
*/
|
|
private bool $autoStage;
|
|
|
|
/** List of months the processed daily log files are from, to consider for monthly aggregation */
|
|
private array $months = [];
|
|
|
|
/** List of log files that need to be processed within this scheduled task and for which the jobs need to be chained. */
|
|
private array $logFiles = [];
|
|
|
|
/**
 * Constructor.
 *
 * Without arguments the task stages the log files itself when it is executed.
 * With arguments, the first one is taken as the month to reprocess: only the files
 * already staged for that month are considered and automatic staging is switched off.
 *
 * @param array $args Scheduled task arguments; the first element, if present, is the month to reprocess.
 */
public function __construct(array $args)
{
    // Automatic staging only applies when no month was requested for reprocessing.
    $isReprocessing = !empty($args);
    $this->autoStage = !$isReprocessing;

    if ($isReprocessing) {
        $monthToReprocess = current($args);
        $this->setOnlyConsiderFiles($this->getStagedFilesByMonth($monthToReprocess));
    }

    // Compress the archived log files if the site setting asks for it.
    $compressArchives = Application::get()->getRequest()->getSite()->getData('compressStatsLogs');
    if ($compressArchives) {
        $this->setCompressArchives(true);
    }

    // The parent loader is given the base filesystem path as its first argument.
    $args[0] = StatisticsHelper::getUsageStatsDirPath();
    parent::__construct($args);

    $this->checkFolderStructure(true);
}
|
|
|
|
/**
 * @copydoc FileLoader::getName()
 *
 * @return string The task name looked up through __() for the key admin.scheduledTask.usageStatsLoader.
 */
public function getName(): string
{
    $taskName = __('admin.scheduledTask.usageStatsLoader');

    return $taskName;
}
|
|
|
|
/**
 * Build the jobs that process a single usage stats log file and compile its stats.
 *
 * executeActions() appends the returned jobs to the dispatched chain as they are,
 * so they must be listed in the order in which they have to run.
 *
 * @param string $filePath Log file to create the jobs for. executeActions() passes the
 *  entries collected by processFile(), which stores the file's base name.
 * @param Site $site The site object handed to the jobs.
 *
 * @return BaseJob[]
 */
abstract protected function getFileJobs(string $filePath, Site $site): array;
|
|
|
|
/**
 * @copydoc FileLoader::executeActions()
 *
 * Stages the log files (when automatic staging is enabled), lets the parent loader
 * process them and then dispatches a single job chain: the jobs of every processed
 * log file, followed by one CompileMonthlyMetrics job per month the files belong to.
 *
 * @return bool False if the parent processing failed or if the processing folder was
 *  not empty when the task started, true otherwise.
 */
protected function executeActions(): bool
{
    // It's possible that the processing directory has files that
    // were being processed but the php process was stopped before
    // finishing the processing, or there may be a concurrent process running.
    // Warn the user if this is the case.
    $processingDirFiles = glob($this->getProcessingPath() . '/' . '*');
    $processingDirError = is_array($processingDirFiles) && count($processingDirFiles) > 0;

    // Log the message only if the processing directory is not empty AND this is not the
    // reprocessing of older log files, i.e. no explicit list of files to consider was set.
    // (The previous check was inverted and logged only while reprocessing.)
    if ($processingDirError && empty($this->getOnlyConsiderFiles())) {
        $this->addExecutionLogEntry(__('admin.scheduledTask.usageStatsLoader.processingPathNotEmpty', ['directory' => $this->getProcessingPath()]), ScheduledTaskHelper::SCHEDULED_TASK_MESSAGE_TYPE_ERROR);
    }

    if ($this->autoStage) {
        $this->autoStage();
    }

    if (!parent::executeActions()) {
        return false;
    }

    $site = Application::get()->getRequest()->getSite();

    // Collect the job lists first and merge them once, instead of re-merging the
    // growing array on every iteration. File jobs come first, monthly compilation last.
    $jobLists = [];
    foreach ($this->logFiles as $filePath) {
        $jobLists[] = $this->getFileJobs($filePath, $site);
    }
    foreach ($this->months as $month) {
        $jobLists[] = [new CompileMonthlyMetrics($month, $site)];
    }
    $jobs = array_merge([], ...$jobLists);

    // Bus::chain() cannot accept an empty array
    if (!empty($jobs)) {
        Bus::chain($jobs)
            ->catch(function (Throwable $e) {
                // NOTE(review): failures in the chain are ignored here — confirm that
                // the jobs report their own errors.
            })
            ->dispatch();

        $this->addExecutionLogEntry(__(
            'admin.scheduledTask.usageStatsLoader.jobDispatched'
        ), ScheduledTaskHelper::SCHEDULED_TASK_MESSAGE_TYPE_NOTICE);
    }

    return !$processingDirError;
}
|
|
|
|
/**
 * Tell whether a log file is recent enough to be processed by this task.
 *
 * The file's date is compared with the earliest date reported by the sushiStats service,
 * i.e. the day the version using the new log file format (and COUNTER R5) was installed.
 * Older files are rejected and an error is added to the execution log.
 *
 * @param string $loadId Log file name, expected in the form usage_events_YYYYMMDD.log
 *
 * @return bool True if the file can be processed, false if it is too old.
 */
protected function isDateValid(string $loadId): bool
{
    // The 8 characters in front of the 4 character extension hold the date (YYYYMMDD).
    $fileDate = substr($loadId, -12, 8);

    // Only the log files from the installation day of the new format onwards
    // can be (regularly) processed here.
    $earliestDate = Services::get('sushiStats')->getEarliestDate();
    $firstValidDate = date('Ymd', strtotime($earliestDate));

    if ($fileDate >= $firstValidDate) {
        return true;
    }

    // The log file is in the old log file format: log the error,
    // the caller returns the file to staging.
    $this->addExecutionLogEntry(
        __('admin.scheduledTask.usageStatsLoader.veryOldLogFile', ['file' => $loadId]),
        ScheduledTaskHelper::SCHEDULED_TASK_MESSAGE_TYPE_ERROR
    );

    return false;
}
|
|
|
|
/**
 * Check if stats for the log file's month do not already exist.
 * Return true if they do not exist, so that log file can be processed.
 * Else, log the error that the CLI script for reprocessing should be called;
 * the caller then returns the file to staging.
 * If the log files of the month are being reprocessed,
 * the CLI reprocessing script will first remove all the stats for the month,
 * so that this function will return true in that case.
 *
 * The check is skipped for the current and the previous month, and whenever
 * the site keeps the daily usage stats.
 *
 * @param string $loadId Log file name, used in the logged error message.
 * @param string $month Month of the log file, in the format YYYYMM.
 *
 * @return bool True if the file can be processed, false if stats for its month already exist.
 */
protected function isMonthValid(string $loadId, string $month): bool
{
    $currentMonth = date('Ym');
    // 'last month' alone keeps the day of month and overflows at month end
    // (e.g. on March 31st it resolves to a day in March again), so anchor
    // the calculation to the first day of the previous month.
    $lastMonth = date('Ym', strtotime('first day of last month'));
    $site = Application::get()->getRequest()->getSite();

    // If the daily metrics are not kept, and this is neither the current nor the
    // previous month, the CLI script to reprocess the whole month should be called.
    if (!$site->getData('keepDailyUsageStats') && $month !== $currentMonth && $month !== $lastMonth) {
        $statsService = Services::get('sushiStats');
        $counterMonthExists = $statsService->monthExists($month);
        $geoService = Services::get('geoStats');
        $geoMonthExists = $geoService->monthExists($month);
        if ($counterMonthExists || $geoMonthExists) {
            $this->addExecutionLogEntry(__(
                'admin.scheduledTask.usageStatsLoader.monthExists',
                ['file' => $loadId]
            ), ScheduledTaskHelper::SCHEDULED_TASK_MESSAGE_TYPE_ERROR);
            return false;
        }
    }

    return true;
}
|
|
|
|
/**
 * Add the log file's month to the list of months to be considered for the
 * stats aggregation after the current log files are processed.
 * A month that is already in the list is not added again.
 *
 * @param string $month Month of a processed log file.
 */
protected function considerMonthForStatsAggregation(string $month): void
{
    // Strict comparison: with loose comparison two different numeric-looking
    // strings (e.g. '0001e3' and '001000') would be treated as the same month.
    if (!in_array($month, $this->months, true)) {
        $this->months[] = $month;
    }
}
|
|
/**
 * @copydoc FileLoader::processFile()
 *
 * The file name MUST be of form usage_events_YYYYMMDD.log
 * The file is not processed here: it is registered, together with its month, so that
 * executeActions() can chain the jobs for all files.
 *
 * @return bool|int FILE_LOADER_RETURN_TO_STAGING if the file fails the date or month
 *  check, FILE_LOADER_RETURN_TO_DISPATCH otherwise.
 */
protected function processFile(string $filePath): bool|int
{
    $loadId = basename($filePath);
    // First 6 characters (YYYYMM) of the date that precedes the 4 character extension.
    $month = substr($loadId, -12, 6);

    // Files selected for reprocessing by the CLI tool skip the checks. All other files
    // must not be in the old log file format and must not belong to a month
    // whose stats already exist.
    $isReprocessed = in_array($loadId, $this->getOnlyConsiderFiles());
    $isProcessable = $isReprocessed
        || ($this->isDateValid($loadId) && $this->isMonthValid($loadId, $month));
    if (!$isProcessable) {
        return self::FILE_LOADER_RETURN_TO_STAGING;
    }

    // Remember the file, so that the jobs for all files can be chained,
    // and its month, so that the stats get aggregated for it.
    $this->logFiles[] = $loadId;
    $this->considerMonthForStatsAggregation($month);

    return self::FILE_LOADER_RETURN_TO_DISPATCH;
}
|
|
|
|
/**
 * Auto stage usage stats log files, also moving files that
 * might be in processing folder to stage folder.
 * The log file of the current day is left where it is.
 */
protected function autoStage(): void
{
    $fileManager = new FileManager();

    // Collect all log files from the usage event logs directory.
    $logFiles = [];
    $logsDirFiles = glob($this->getUsageEventLogsPath() . '/*');
    if (is_array($logsDirFiles)) {
        $logFiles = array_merge($logFiles, $logsDirFiles);
    }

    // It's possible that the processing directory has files that
    // were being processed but the php process was stopped before
    // finishing the processing. Move them to the stage directory too.
    $processingDirFiles = glob($this->getProcessingPath() . '/*');
    if (is_array($processingDirFiles)) {
        $logFiles = array_merge($logFiles, $processingDirFiles);
    }

    // Both values are the same for every file, so resolve them once.
    $currentDayFilename = $this->getUsageEventCurrentDayLogName();
    $stagePath = $this->getStagePath();

    foreach ($logFiles as $filePath) {
        if (!$fileManager->fileExists($filePath)) {
            continue;
        }
        $filename = pathinfo($filePath, PATHINFO_BASENAME);
        // Skip the current day's log file.
        if ($filename === $currentDayFilename) {
            continue;
        }
        $this->moveFile(pathinfo($filePath, PATHINFO_DIRNAME), $stagePath, $filename);
    }
}
|
|
|
|
/**
 * Get staged usage log files belonging to a month, that should be reprocessed.
 *
 * @param string $month Month prefix the file names are matched against,
 *  i.e. files named usage_events_<month>... are returned.
 *
 * @return string[] File names (without path) found in the stage folder; empty if the
 *  stage folder does not exist or cannot be opened.
 */
protected function getStagedFilesByMonth(string $month): array
{
    $stagePath = StatisticsHelper::getUsageStatsDirPath() . '/' . self::FILE_LOADER_PATH_STAGING;

    // The constructor calls this before the folder structure is checked, so the
    // stage folder may not exist yet: there is then nothing to reprocess.
    if (!is_dir($stagePath)) {
        return [];
    }
    $stageDir = opendir($stagePath);
    if ($stageDir === false) {
        return [];
    }

    $files = [];
    $prefix = 'usage_events_' . $month;
    try {
        // Compare against false explicitly: a directory entry such as "0" is falsy
        // and would otherwise end the loop too early.
        while (($filename = readdir($stageDir)) !== false) {
            if (str_starts_with($filename, $prefix)) {
                $files[] = $filename;
            }
        }
    } finally {
        // Always release the directory handle.
        closedir($stageDir);
    }

    return $files;
}
|
|
|
|
/**
 * Get the usage event logs directory path.
 *
 * @return string The usageEventLogs folder inside the usage stats directory.
 */
protected function getUsageEventLogsPath(): string
{
    $usageStatsDir = StatisticsHelper::getUsageStatsDirPath();

    return $usageStatsDir . '/usageEventLogs';
}
|
|
|
|
/**
 * Get current day usage event log name.
 *
 * @return string File name in the form usage_events_YYYYMMDD.log, built from today's date.
 */
protected function getUsageEventCurrentDayLogName(): string
{
    $today = date('Ymd');

    return 'usage_events_' . $today . '.log';
}
|
|
}
|