141 lines
4.6 KiB
PHP
141 lines
4.6 KiB
PHP
<?php
|
|
|
|
/**
 * @file classes/search/SubmissionSearchIndex.php
 *
 * Copyright (c) 2014-2021 Simon Fraser University
 * Copyright (c) 2003-2021 John Willinsky
 * Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
 *
 * @class SubmissionSearchIndex
 *
 * @ingroup search
 *
 * @brief Class to maintain a submission search index.
 */
|
|
|
|
namespace PKP\search;
|
|
|
|
use APP\submission\Submission;
|
|
use PKP\config\Config;
|
|
use PKP\core\PKPString;
|
|
|
|
abstract class SubmissionSearchIndex
{
    public const SEARCH_STOPWORDS_FILE = 'lib/pkp/registry/stopwords.txt';

    // Words are truncated to at most this length
    public const SEARCH_KEYWORD_MAX_LENGTH = 40;

    /**
     * Split a string into a clean array of keywords
     *
     * The text is normalised (invalid UTF-8 sequences dropped, control characters,
     * marks, punctuation, symbols and separators replaced by spaces, lower-cased),
     * split on whitespace and then filtered against the stopword list, the
     * configured minimum word length ("search" / "min_word_length") and numeric values.
     * Each surviving word is truncated to SEARCH_KEYWORD_MAX_LENGTH characters.
     *
     * @param string|array $text Text to tokenize; an array is joined with line breaks first
     * @param bool $allowWildcards When true each "*" is converted to "%", otherwise it is treated as a separator
     * @param bool $allowShortWords When true, words shorter than the configured minimum length are kept
     * @param bool $allowNumericWords When true, words for which is_numeric() returns true are kept
     *
     * @return string[] of keywords
     */
    public static function filterKeywords($text, $allowWildcards = false, bool $allowShortWords = false, bool $allowNumericWords = false): array
    {
        $minLength = Config::getVar('search', 'min_word_length');
        $stopwords = static::loadStopwords();

        // Join multiple lines into a single string
        if (is_array($text)) {
            $text = join("\n", $text);
        }

        // Attempts to fix bad UTF-8 characters: with the substitute character set to
        // "none", invalid sequences are dropped by the UTF-8 => UTF-8 conversion.
        $previous = mb_substitute_character();
        mb_substitute_character('none');
        try {
            $text = mb_convert_encoding($text, 'UTF-8', 'UTF-8');
        } finally {
            // Always restore the previous setting, even if the conversion throws
            mb_substitute_character($previous);
        }

        // Removes all control (C) characters, marks (M), punctuations (P), symbols (S) and separators (Z) except "*" (which is addressed below).
        // The negative lookahead must guard every character of the run: when it only guards the first one,
        // a "*" that follows another separator (e.g. "foo *bar") is consumed by the quantifier and the wildcard is lost.
        $text = PKPString::regexp_replace('/(?:(?!\*)[\\p{C}\\p{M}\\p{P}\\p{S}\\p{Z}])+/', ' ', $text);
        $text = PKPString::regexp_replace('/[\*]/', $allowWildcards ? '%' : ' ', $text);
        $text = PKPString::strtolower($text);

        // Split into words
        $words = PKPString::regexp_split('/\s+/', $text);

        // FIXME Do not perform further filtering for some fields, e.g., author names?

        $keywords = [];
        foreach ($words as $word) {
            // Ignores stop words; the stopword map also contains the empty string,
            // so empty tokens produced by the split are discarded here as well
            if (!empty($stopwords[$word])) {
                continue;
            }
            // Ignores short words (when $allowShortWords is false)
            if (!$allowShortWords && PKPString::strlen($word) < $minLength) {
                continue;
            }
            // Ignores numeric words (when $allowNumericWords is false)
            if (!$allowNumericWords && is_numeric($word)) {
                continue;
            }
            $keywords[] = PKPString::substr($word, 0, static::SEARCH_KEYWORD_MAX_LENGTH);
        }
        return $keywords;
    }

    /**
     * Return list of stopwords.
     * FIXME: Should this be locale-specific?
     *
     * The file named by SEARCH_STOPWORDS_FILE is read once and the resulting map is
     * cached in a static variable for subsequent calls. Blank lines and lines
     * starting with "#" are skipped; the empty string is always part of the map.
     *
     * @return array<string,int> Stop words (in lower case) as keys and 1 as value
     */
    protected static function loadStopwords()
    {
        static $searchStopwords;

        if ($searchStopwords !== null) {
            return $searchStopwords;
        }

        // file() returns false when the file cannot be read; fall back to an empty list
        // so that only the empty word ends up in the map
        $lines = file(base_path(static::SEARCH_STOPWORDS_FILE)) ?: [];

        $searchStopwords = array_fill_keys(
            collect($lines)
                ->map(fn (string $word) => trim($word))
                // Ignore comments/line-breaks
                ->filter(fn (string $word) => !empty($word) && $word[0] !== '#')
                // Include a map for empty words
                ->push('')
                ->toArray(),
            1
        );

        return $searchStopwords;
    }

    /**
     * Let the indexing back-end know that the current transaction
     * finished so that the index can be batch-updated.
     */
    abstract public function submissionChangesFinished();

    /**
     * Signal to the indexing back-end that the metadata of a submission
     * changed.
     *
     * Push indexing implementations will try to immediately update
     * the index to reflect the changes. Pull implementations will
     * mark articles as "changed" and let the indexing back-end decide
     * the best point in time to actually index the changed data.
     *
     * @param Submission $submission
     */
    abstract public function submissionMetadataChanged($submission);

    /**
     * Remove indexed file contents for a submission
     *
     * @param Submission $submission
     */
    abstract public function clearSubmissionFiles($submission);

    /**
     * Delete a submission's search indexing
     *
     * @param int $submissionId ID of the submission whose index entries are deleted
     * @param int $type optional (defaults to null)
     * @param int $assocId optional (defaults to null)
     */
    abstract public function deleteTextIndex(
        int $submissionId,
        $type = null,
        $assocId = null
    );
}
|
|
|
|
// Backwards compatibility layer: outside of strict mode, expose the class under
// its global (non-namespaced) name and publish its constants as global constants.
if (!PKP_STRICT_MODE) {
    class_alias('\PKP\search\SubmissionSearchIndex', '\SubmissionSearchIndex');

    // Each global constant takes the value of the class constant with the same name
    define('SEARCH_STOPWORDS_FILE', constant('\SubmissionSearchIndex::' . 'SEARCH_STOPWORDS_FILE'));
    define('SEARCH_KEYWORD_MAX_LENGTH', constant('\SubmissionSearchIndex::' . 'SEARCH_KEYWORD_MAX_LENGTH'));
}
|