first commit
This commit is contained in:
@@ -0,0 +1,292 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* @file classes/tasks/PKPUsageStatsLoader.php
|
||||
*
|
||||
* Copyright (c) 2022 Simon Fraser University
|
||||
* Copyright (c) 2022 John Willinsky
|
||||
* Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
|
||||
*
|
||||
* @class PKPUsageStatsLoader
|
||||
*
|
||||
* @ingroup tasks
|
||||
*
|
||||
* @brief Scheduled task to extract, transform and load usage statistics data into the database.
|
||||
*/
|
||||
|
||||
namespace PKP\task;
|
||||
|
||||
use APP\core\Application;
|
||||
use APP\core\Services;
|
||||
use APP\statistics\StatisticsHelper;
|
||||
use Illuminate\Support\Facades\Bus;
|
||||
use PKP\file\FileManager;
|
||||
use PKP\jobs\statistics\CompileMonthlyMetrics;
|
||||
use PKP\scheduledTask\ScheduledTaskHelper;
|
||||
use PKP\site\Site;
|
||||
use Throwable;
|
||||
|
||||
abstract class PKPUsageStatsLoader extends FileLoader
|
||||
{
|
||||
/**
|
||||
* If the log files should be automatically moved to the stage folder.
|
||||
* This is the case for daily log file processing.
|
||||
* This is not the case if the whole month is reprocessed - all log files for the given month should be manually placed in the stage folder.
|
||||
*/
|
||||
private bool $autoStage;
|
||||
|
||||
/** List of months the processed daily log files are from, to consider for monthly aggregation */
|
||||
private array $months = [];
|
||||
|
||||
/** List of log files that need to be processed within this scheduled task, and for which the jobs need to be chained. */
|
||||
private array $logFiles = [];
|
||||
|
||||
/**
 * Constructor.
 *
 * Configures the loader before handing over to the FileLoader constructor:
 * decides whether log files are staged automatically, limits processing to
 * the staged files of one month when a month is passed in, and switches on
 * archive compression when the site setting `compressStatsLogs` is set.
 *
 * @param array $args Scheduled task arguments. When not empty, the current
 *  element is used as the month whose staged log files are reprocessed
 *  (presumably in YYYYMM form, matching the usage_events_YYYYMMDD.log names).
 */
public function __construct(array $args)
{
    // A month argument means a whole month is reprocessed from files that were
    // placed in the stage folder manually, so automatic staging is turned off.
    $isMonthReprocessing = !empty($args);
    $this->autoStage = !$isMonthReprocessing;
    if ($isMonthReprocessing) {
        $monthToReprocess = current($args);
        $this->setOnlyConsiderFiles($this->getStagedFilesByMonth($monthToReprocess));
    }

    // Compress the archived log files if the site is configured that way.
    $compressArchives = Application::get()->getRequest()->getSite()->getData('compressStatsLogs');
    if ($compressArchives) {
        $this->setCompressArchives(true);
    }

    // The parent loader receives the usage stats base directory as its first
    // argument; this replaces the month argument, if there was one.
    $args[0] = StatisticsHelper::getUsageStatsDirPath();
    parent::__construct($args);

    $this->checkFolderStructure(true);
}
|
||||
|
||||
/**
 * @copydoc FileLoader::getName()
 *
 * @return string Name of the task, looked up via the
 *  `admin.scheduledTask.usageStatsLoader` locale key.
 */
public function getName(): string
{
    $taskName = __('admin.scheduledTask.usageStatsLoader');

    return $taskName;
}
|
||||
|
||||
/**
 * Build the jobs that process one usage stats log file and compile its stats.
 *
 * The returned jobs are appended to a single job chain in the order given
 * here, so they must already be in the right execution order.
 *
 * @param string $filePath The log file to build jobs for, as collected by processFile()
 * @param Site $site The site the stats are compiled for
 *
 * @return BaseJob[]
 */
abstract protected function getFileJobs(
    string $filePath,
    Site $site
): array;
|
||||
|
||||
/**
 * @copydoc FileLoader::executeActions()
 *
 * Stages the log files (when auto staging is enabled), lets the parent loader
 * process them, and then dispatches one job chain made of the jobs of every
 * collected log file followed by one CompileMonthlyMetrics job per collected month.
 *
 * @return bool False when the parent processing failed, or when the processing
 *  folder was not empty when this task started; true otherwise.
 */
protected function executeActions(): bool
{
    // It's possible that the processing directory has files that
    // were being processed but the php process was stopped before
    // finishing the processing, or there may be a concurrent process running.
    // Warn the user if this is the case.
    $processingDirFiles = glob($this->getProcessingPath() . '/' . '*');
    $processingDirError = is_array($processingDirFiles) && count($processingDirFiles) > 0;

    // Only log the message for the regular (daily) run. An empty "only consider"
    // list means this is NOT the reprocessing of older log files.
    // (The check used to be inverted and logged only while reprocessing.)
    if ($processingDirError && empty($this->getOnlyConsiderFiles())) {
        $this->addExecutionLogEntry(
            __('admin.scheduledTask.usageStatsLoader.processingPathNotEmpty', ['directory' => $this->getProcessingPath()]),
            ScheduledTaskHelper::SCHEDULED_TASK_MESSAGE_TYPE_ERROR
        );
    }

    if ($this->autoStage) {
        $this->autoStage();
    }

    // The parent calls processFile() for each file, which fills
    // $this->logFiles and $this->months.
    if (!parent::executeActions()) {
        return false;
    }

    $site = Application::get()->getRequest()->getSite();

    // First the jobs of every log file, in the order the files were processed ...
    $jobs = [];
    foreach ($this->logFiles as $filePath) {
        $jobs = array_merge($jobs, $this->getFileJobs($filePath, $site));
    }
    // ... then the monthly aggregation for every month those files belong to.
    foreach ($this->months as $month) {
        $jobs[] = new CompileMonthlyMetrics($month, $site);
    }

    // Bus::chain() cannot accept an empty array
    if (!empty($jobs)) {
        Bus::chain($jobs)
            // Static closure: nothing of this task object is needed in the handler.
            // Record the failure instead of silently discarding it.
            ->catch(static function (Throwable $e): void {
                error_log('Usage stats job chain failed: ' . $e->getMessage());
            })
            ->dispatch();

        $this->addExecutionLogEntry(
            __('admin.scheduledTask.usageStatsLoader.jobDispatched'),
            ScheduledTaskHelper::SCHEDULED_TASK_MESSAGE_TYPE_NOTICE
        );
    }

    // Leftover files in the processing folder make the task report failure,
    // even though the jobs above were still dispatched.
    return !$processingDirError;
}
|
||||
|
||||
/**
 * Tell whether a log file is recent enough to be processed by this task.
 *
 * Only log files dated on or after the earliest date reported by the
 * `sushiStats` service (the day the version using the new log file format
 * and COUNTER R5 was installed) can be processed regularly. For older files
 * an error is added to the execution log.
 *
 * @param string $loadId Log file name of the form usage_events_YYYYMMDD.log
 *
 * @return bool True if the file can be processed, false if it is too old.
 */
protected function isDateValid(string $loadId): bool
{
    // The last 12 characters are "YYYYMMDD.log"; keep the 8 date characters.
    $fileDate = substr($loadId, -12, 8);

    $earliestDate = Services::get('sushiStats')->getEarliestDate();
    $dateR5Installed = date('Ymd', strtotime($earliestDate));

    if ($fileDate >= $dateR5Installed) {
        return true;
    }

    // The log file is in the old log file format: log the error. Returning
    // false lets the caller send the file back to staging.
    $message = __(
        'admin.scheduledTask.usageStatsLoader.veryOldLogFile',
        ['file' => $loadId]
    );
    $this->addExecutionLogEntry($message, ScheduledTaskHelper::SCHEDULED_TASK_MESSAGE_TYPE_ERROR);

    return false;
}
|
||||
|
||||
/**
 * Check if stats for the log file's month do not already exist.
 *
 * Return true if they do not exist, so that the log file can be processed.
 * Else, log the error that the CLI script for reprocessing should be called;
 * returning false lets the caller send the file back to staging.
 * If the log files of the month are being reprocessed,
 * the CLI reprocessing script will first remove all the stats for the month,
 * so that this function will return true in that case.
 *
 * The check is skipped (and true is returned) when the site keeps daily usage
 * stats, or when the month is the current or the previous calendar month.
 *
 * @param string $loadId Log file name, used in the logged error message
 * @param string $month Month of the log file, in the same format as date('Ym')
 *
 * @return bool True if the log file can be processed.
 */
protected function isMonthValid(string $loadId, string $month): bool
{
    $currentMonth = date('Ym');
    // A bare 'last month' keeps the day of month and overflows on days 29-31
    // (e.g. March 31st resolves to early March), which would make the previous
    // month equal to the current one. Anchoring to the first day avoids that.
    $lastMonth = date('Ym', strtotime('first day of last month'));

    $site = Application::get()->getRequest()->getSite();

    // If the daily metrics are kept, or this is the current or the previous
    // month, the log file can be processed without further checks.
    if ($site->getData('keepDailyUsageStats') || in_array($month, [$currentMonth, $lastMonth], true)) {
        return true;
    }

    // Otherwise the month must not already have stats; if it does,
    // the CLI script to reprocess the whole month should be called.
    $counterMonthExists = Services::get('sushiStats')->monthExists($month);
    $geoMonthExists = Services::get('geoStats')->monthExists($month);
    if ($counterMonthExists || $geoMonthExists) {
        $this->addExecutionLogEntry(__(
            'admin.scheduledTask.usageStatsLoader.monthExists',
            ['file' => $loadId]
        ), ScheduledTaskHelper::SCHEDULED_TASK_MESSAGE_TYPE_ERROR);
        return false;
    }

    return true;
}
|
||||
|
||||
/**
 * Remember a month for the stats aggregation that runs after the current
 * log files are processed. A month already in the list is not added again.
 *
 * @param string $month Month of a processed log file
 */
protected function considerMonthForStatsAggregation(string $month): void
{
    $alreadyListed = in_array($month, $this->months);
    if ($alreadyListed) {
        return;
    }

    $this->months[] = $month;
}
|
||||
/**
 * @copydoc FileLoader::processFile()
 *
 * The file name MUST be of the form usage_events_YYYYMMDD.log
 * Nothing is loaded here: the file name and its month are only collected, so
 * that executeActions() can chain the jobs for all files afterwards.
 *
 * @return bool|int FILE_LOADER_RETURN_TO_STAGING when the file fails the date
 *  or month check, FILE_LOADER_RETURN_TO_DISPATCH otherwise.
 */
protected function processFile(string $filePath): bool|int
{
    $fileName = basename($filePath);
    // The last 12 characters are "YYYYMMDD.log"; the first 6 of them are the month.
    $fileMonth = substr($fileName, -12, 6);

    // Files being reprocessed with the CLI tool skip the checks. All other files
    // must not be in the old log format, and stats for their month must not exist yet.
    $isBeingReprocessed = in_array($fileName, $this->getOnlyConsiderFiles());
    if (!$isBeingReprocessed) {
        $canBeProcessed = $this->isDateValid($fileName) && $this->isMonthValid($fileName, $fileMonth);
        if (!$canBeProcessed) {
            return self::FILE_LOADER_RETURN_TO_STAGING;
        }
    }

    // Collect the log file, so that all jobs for all files can be chained.
    $this->logFiles[] = $fileName;
    // Collect the month the stats need to be aggregated for.
    $this->considerMonthForStatsAggregation($fileMonth);

    return self::FILE_LOADER_RETURN_TO_DISPATCH;
}
|
||||
|
||||
/**
 * Move usage stats log files into the stage folder.
 *
 * Candidates are all files in the usage event logs folder plus any files left
 * in the processing folder (e.g. by a php process that was stopped before it
 * finished). The log file of the current day is left where it is.
 */
protected function autoStage(): void
{
    $fileManager = new FileManager();

    $eventLogFiles = glob($this->getUsageEventLogsPath() . '/*');
    $leftoverFiles = glob($this->getProcessingPath() . '/*');

    // glob() may not return an array; treat that as "no files".
    $candidates = array_merge(
        is_array($eventLogFiles) ? $eventLogFiles : [],
        is_array($leftoverFiles) ? $leftoverFiles : []
    );

    foreach ($candidates as $candidate) {
        if (!$fileManager->fileExists($candidate)) {
            continue;
        }

        $name = pathinfo($candidate, PATHINFO_BASENAME);
        // The current day log file is not staged.
        if ($name == $this->getUsageEventCurrentDayLogName()) {
            continue;
        }

        $sourceDir = pathinfo($candidate, PATHINFO_DIRNAME);
        $this->moveFile($sourceDir, $this->getStagePath(), $name);
    }
}
|
||||
|
||||
/**
 * Get the staged usage log files belonging to a month, that should be reprocessed.
 *
 * @param string $month Month prefix to match; files named
 *  `usage_events_<month>...` in the stage folder are returned
 *
 * @return string[] File names (without path). Empty if the stage folder
 *  does not exist or cannot be opened.
 */
protected function getStagedFilesByMonth(string $month): array
{
    $files = [];
    $stagePath = StatisticsHelper::getUsageStatsDirPath() . '/' . self::FILE_LOADER_PATH_STAGING;

    // Without a readable stage folder there is nothing to reprocess; passing
    // a failed opendir() result on to readdir() would raise an error.
    if (!is_dir($stagePath)) {
        return $files;
    }
    $stageDir = opendir($stagePath);
    if ($stageDir === false) {
        return $files;
    }

    $prefix = 'usage_events_' . $month;
    // Compare against false explicitly: an entry named "0" is falsy and
    // would otherwise end the loop early.
    while (($filename = readdir($stageDir)) !== false) {
        if (str_starts_with($filename, $prefix)) {
            $files[] = $filename;
        }
    }
    // Release the directory handle.
    closedir($stageDir);

    return $files;
}
|
||||
|
||||
/**
 * Get the path of the usage event logs directory.
 *
 * @return string The `usageEventLogs` folder inside the usage stats directory.
 */
protected function getUsageEventLogsPath(): string
{
    $usageStatsDir = StatisticsHelper::getUsageStatsDirPath();

    return $usageStatsDir . '/usageEventLogs';
}
|
||||
|
||||
/**
 * Get the name of the usage event log file of the current day.
 *
 * @return string File name of the form usage_events_YYYYMMDD.log, using today's date.
 */
protected function getUsageEventCurrentDayLogName(): string
{
    $today = date('Ymd');

    return 'usage_events_' . $today . '.log';
}
|
||||
}
|
||||
Reference in New Issue
Block a user