first commit

This commit is contained in:
CHIEFSOFT\ameye
2024-06-08 17:09:23 -04:00
commit df3a033196
17887 changed files with 8637778 additions and 0 deletions
+162
View File
@@ -0,0 +1,162 @@
<?php
/**
* @defgroup search Search
* Implements search tools, such as file parsers, workflow integration,
* indexing, querying, etc.
*/
/**
* @file classes/search/SearchFileParser.php
*
* Copyright (c) 2014-2021 Simon Fraser University
* Copyright (c) 2000-2021 John Willinsky
* Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
*
* @class SearchFileParser
*
* @ingroup search
*
* @brief Abstract class to extract search text from a given file.
*/
namespace PKP\search;
use Exception;
use PKP\config\Config;
use PKP\submissionFile\SubmissionFile;
class SearchFileParser
{
    /** @var string the complete path to the file */
    public $filePath;

    /** @var resource|false|null file handle; null/false while no file is open */
    public $fp;

    /**
     * Constructor.
     *
     * @param string $filePath
     */
    public function __construct($filePath)
    {
        $this->filePath = $filePath;
    }

    /**
     * Return the path to the file.
     *
     * @return string
     */
    public function getFilePath()
    {
        return $this->filePath;
    }

    /**
     * Change the file path.
     *
     * @param string $filePath
     */
    public function setFilePath($filePath)
    {
        $this->filePath = $filePath;
    }

    /**
     * Open the file for binary reading.
     *
     * @throws Exception If the file cannot be opened
     *
     * @return bool Always true; a failure is reported through the exception
     */
    public function open()
    {
        // The warning is silenced because the failure is reported through the exception below
        $this->fp = @fopen($this->filePath, 'rb');
        if (!$this->fp) {
            // error_get_last() returns an array (or null), so only its message is appended
            $lastError = error_get_last()['message'] ?? 'unknown error';
            throw new Exception("Failed to parse the file \"{$this->filePath}\". Last error: {$lastError}");
        }
        return true;
    }

    /**
     * Close the file. Calling it again, or without a previous open(), is a no-op.
     */
    public function close()
    {
        if ($this->fp) {
            fclose($this->fp);
        }
        // Drop the handle: a closed resource is still truthy, so keeping it would let
        // read()/close() hand an invalid stream to feof()/fclose()
        $this->fp = null;
    }

    /**
     * Read and return the next block/line of text.
     *
     * @return string|false false when no file is open or the end of the file was reached
     */
    public function read()
    {
        if (!$this->fp || feof($this->fp)) {
            return false;
        }
        return $this->doRead();
    }

    /**
     * Read from the file pointer. Subclasses override this to post-process the raw line.
     *
     * @return string|false The next line, or false when fgets() could not read one
     */
    public function doRead()
    {
        return fgets($this->fp);
    }

    //
    // Static methods
    //
    /**
     * Create a text parser for a submission file.
     * The full path is built from the "files_dir" setting and the file's "path" value.
     *
     * @param SubmissionFile $submissionFile
     *
     * @return ?SearchFileParser
     */
    public static function fromFile($submissionFile)
    {
        $fullPath = rtrim(Config::getVar('files', 'files_dir'), '/') . '/' . $submissionFile->getData('path');
        return static::fromFileType($submissionFile->getData('mimetype'), $fullPath);
    }

    /**
     * Create a text parser for a file of the given type.
     *
     * @param string $type Type of the file, matched against the index[$type] settings and the built-in parsers
     * @param string $path
     *
     * @return ?SearchFileParser null when the type is disabled or no suitable parser exists
     */
    public static function fromFileType($type, $path)
    {
        if (Config::getVar('search', "index[{$type}]")) {
            $parserType = 'process';
        } else {
            // If an indexer definition exists, but its value is falsy, we assume the user wants to disable the default handler
            $parserType = Config::hasVar('search', "index[{$type}]") ? 'disabled' : $type;
        }
        return match ($parserType) {
            // External process
            'process' => new SearchHelperParser($type, $path),
            // Text processor
            'text/plain' => new static($path),
            // HTML/XML processor
            'text/html', 'text/xml', 'application/xhtml', 'application/xml' => new SearchHTMLParser($path),
            // Disabled/no suitable parser
            default => null
        };
    }
}
// Unless strict mode is enabled, also register the class under its non-namespaced (global) name
if (!PKP_STRICT_MODE) {
class_alias('\PKP\search\SearchFileParser', '\SearchFileParser');
}
@@ -0,0 +1,35 @@
<?php
/**
* @file classes/search/SearchHTMLParser.php
*
* Copyright (c) 2014-2021 Simon Fraser University
* Copyright (c) 2000-2021 John Willinsky
* Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
*
* @class SearchHTMLParser
*
* @ingroup search
*
* @brief Class to extract text from an HTML file.
*/
namespace PKP\search;
class SearchHTMLParser extends SearchFileParser
{
    /**
     * Read the next line from the file and reduce it to plain text.
     *
     * Tags are stripped line by line, then HTML entities are decoded to UTF-8 characters.
     *
     * @return string|false The plain text of the line, or false when fgets() could not read one
     */
    public function doRead()
    {
        $line = fgets($this->fp);
        if ($line === false) {
            // Nothing was read: report it like the parent does instead of returning an empty string
            return false;
        }
        // strip HTML tags from the read line, then convert HTML entities to valid UTF-8 characters
        return html_entity_decode(strip_tags($line), ENT_COMPAT, 'UTF-8');
    }
}
// Unless strict mode is enabled, also register the class under its non-namespaced (global) name
if (!PKP_STRICT_MODE) {
class_alias('\PKP\search\SearchHTMLParser', '\SearchHTMLParser');
}
@@ -0,0 +1,59 @@
<?php
/**
* @file classes/search/SearchHelperParser.php
*
* Copyright (c) 2014-2021 Simon Fraser University
* Copyright (c) 2000-2021 John Willinsky
* Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
*
* @class SearchHelperParser
*
* @ingroup search
*
* @brief Class to extract text from a file using an external helper program.
*/
namespace PKP\search;
use Exception;
use PKP\config\Config;
class SearchHelperParser extends SearchFileParser
{
    /** @var string Type should match an index[$type] setting in the "search" section of config.inc.php */
    public $type;

    /** @var ?string The command built by open(); kept so that close() can report it on failure */
    private $command;

    /**
     * Constructor.
     *
     * @param string $type
     * @param string $filePath
     */
    public function __construct($type, $filePath)
    {
        parent::__construct($filePath);
        $this->type = $type;
    }

    /**
     * Start the helper program configured for this type and open a pipe to read its output.
     *
     * The index[$type] setting is used as a sprintf() template that receives the
     * shell-escaped file path.
     *
     * @throws Exception If the pipe to the command cannot be opened
     *
     * @return bool true when the pipe was opened, false when no helper is configured for the type
     */
    public function open()
    {
        $prog = Config::getVar('search', 'index[' . $this->type . ']');
        if (!isset($prog)) {
            return false;
        }

        $this->command = sprintf($prog, escapeshellarg($this->getFilePath()));
        // The warning is silenced because the failure is reported through the exception below
        $this->fp = @popen($this->command, 'r');
        if (!$this->fp) {
            // error_get_last() returns an array (or null), so only its message is appended
            $lastError = error_get_last()['message'] ?? 'unknown error';
            throw new Exception("Failed to parse file {$this->getFilePath()} through the command: {$this->command}\nLast error: {$lastError}");
        }
        return true;
    }

    /**
     * Close the pipe to the helper program.
     *
     * @throws Exception If pclose() reports a non-zero value for the process
     */
    public function close()
    {
        if (!$this->fp) {
            return;
        }

        $exitCode = pclose($this->fp);
        // Drop the handle before reporting anything: the pipe is closed either way and a closed
        // resource is still truthy, so keeping it would let read()/close() use an invalid stream
        $this->fp = null;
        if ($exitCode) {
            throw new Exception("The indexation process exited with the code \"{$exitCode}\", perhaps the command failed: {$this->command}");
        }
    }
}
// Unless strict mode is enabled, also register the class under its non-namespaced (global) name
if (!PKP_STRICT_MODE) {
class_alias('\PKP\search\SearchHelperParser', '\SearchHelperParser');
}
+433
View File
@@ -0,0 +1,433 @@
<?php
/**
* @file classes/search/SubmissionSearch.php
*
* Copyright (c) 2014-2021 Simon Fraser University
* Copyright (c) 2003-2021 John Willinsky
* Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
*
* @class SubmissionSearch
*
* @ingroup search
*
* @see SubmissionSearchDAO
*
* @brief Class for retrieving search results.
*
* FIXME: NEAR; precedence w/o parens?; stemming; weighted counting
*/
namespace PKP\search;
use APP\core\Application;
use APP\core\Request;
use PKP\config\Config;
use PKP\context\Context;
use PKP\core\PKPString;
use PKP\core\VirtualArrayIterator;
use PKP\db\DAO;
use PKP\plugins\Hook;
use PKP\user\User;
abstract class SubmissionSearch
{
// Search types
public const SUBMISSION_SEARCH_AUTHOR = 1;
public const SUBMISSION_SEARCH_TITLE = 2;
public const SUBMISSION_SEARCH_ABSTRACT = 4;
public const SUBMISSION_SEARCH_DISCIPLINE = 8;
public const SUBMISSION_SEARCH_SUBJECT = 16;
public const SUBMISSION_SEARCH_KEYWORD = 17;
public const SUBMISSION_SEARCH_TYPE = 32;
public const SUBMISSION_SEARCH_COVERAGE = 64;
public const SUBMISSION_SEARCH_GALLEY_FILE = 128;
public const SUBMISSION_SEARCH_SUPPLEMENTARY_FILE = 256;
public const SUBMISSION_SEARCH_INDEX_TERMS = 120;
public const SUBMISSION_SEARCH_DEFAULT_RESULT_LIMIT = 20;
/**
 * Constructor.
 *
 * Intentionally empty: it performs no initialization of its own.
 */
public function __construct()
{
}
/**
 * Tokenizes a search query string and turns it into a keyword structure.
 * Supports +/-, AND/OR, parens
 *
 * Every token is an optional sign ("+" or "-") followed by a double-quoted phrase,
 * a parenthesis or a run of characters that are neither whitespace nor ")".
 *
 * @param string $query
 *
 * @return array of the form ('+' => <required>, '' => <optional>, '-' => excluded)
 */
public function _parseQuery($query)
{
    $captures = null;
    $tokenCount = PKPString::regexp_match_all('/(\+|\-|)("[^"]+"|\(|\)|[^\s\)]+)/', $query, $captures);

    // Group 1 holds the signs, group 2 the tokens; parsing starts at the first token
    $cursor = 0;
    return $this->_parseQueryInternal($captures[1], $captures[2], $cursor, $tokenCount);
}
/**
 * Query parsing helper routine.
 * Returned structure is based on that used by the Search::QueryParser Perl module.
 *
 * Reads tokens starting at $pos and files each term under the required ('+'),
 * optional ('') or excluded ('-') bucket. An opening parenthesis starts a recursive
 * call whose result is stored as a nested group; a closing parenthesis ends the
 * current call.
 *
 * @param array $signTokens Sign captured in front of each token ('+', '-' or '')
 * @param array $tokens The tokens themselves
 * @param int $pos Index of the next token to read; passed by reference and advanced as tokens are consumed
 * @param int $total Number of tokens
 *
 * @return array of the form ('+' => <required>, '' => <optional>, '-' => <excluded>)
 */
public function _parseQueryInternal($signTokens, $tokens, &$pos, $total)
{
$return = ['+' => [], '' => [], '-' => []];
// $postBool: operator found right after the current term; $preBool: operator found after the previous term
$postBool = $preBool = '';
$submissionSearchIndex = Application::getSubmissionSearchIndex();
// Operators are matched in lower case against their localized names
$notOperator = PKPString::strtolower(__('search.operator.not'));
$andOperator = PKPString::strtolower(__('search.operator.and'));
$orOperator = PKPString::strtolower(__('search.operator.or'));
while ($pos < $total) {
// An explicit sign on the token wins; otherwise a sign set by a previous iteration
// (e.g. by the NOT operator) is kept, and '+' is the fallback
if (!empty($signTokens[$pos])) {
$sign = $signTokens[$pos];
} elseif (empty($sign)) {
$sign = '+';
}
$token = PKPString::strtolower($tokens[$pos++]);
switch ($token) {
case $notOperator:
// The '-' sign is carried over to the next token
$sign = '-';
break;
case ')':
// End of the current (sub-)expression
return $return;
case '(':
// Parse the sub-expression; the resulting array is then handled like a term below
$token = $this->_parseQueryInternal($signTokens, $tokens, $pos, $total);
// no break
default:
$postBool = '';
// Look ahead: an AND/OR operator following the term is consumed here
if ($pos < $total) {
$peek = PKPString::strtolower($tokens[$pos]);
if ($peek == $orOperator) {
$postBool = 'or';
$pos++;
} elseif ($peek == $andOperator) {
$postBool = 'and';
$pos++;
}
}
// The operator after the term takes precedence over the one that preceded it
$bool = empty($postBool) ? $preBool : $postBool;
$preBool = $postBool;
// Terms joined by OR go to the optional bucket, whatever their sign was
if ($bool == 'or') {
$sign = '';
}
if (is_array($token)) {
// Nested group returned by the recursive call
$k = $token;
} else {
$k = $submissionSearchIndex->filterKeywords($token, true);
}
// Terms that produced no keywords are dropped
if (!empty($k)) {
$return[$sign][] = $k;
}
// Reset the sign so that the next token starts from the default again
$sign = '';
break;
}
}
return $return;
}
/**
 * Regroups the parsed keywords of every search type by sign and merges their results.
 *
 * For each search type, its required, optional and excluded keywords are wrapped into
 * separate groups tagged with that type. Excluded keywords are stored in the optional
 * slot of their group, and the group itself is listed among the excluded groups.
 *
 * @return array The merged results returned by _getMergedKeywordResults().
 */
public function _getMergedArray($context, &$keywords, $publishedFrom, $publishedTo)
{
    $resultsPerKeyword = Config::getVar('search', 'results_per_keyword', 100);

    // Sign of the parsed keywords => slot they occupy inside the generated group
    $slotBySign = ['+' => '+', '' => '', '-' => ''];

    $grouped = ['+' => [], '' => [], '-' => []];
    foreach ($keywords as $type => $parsedQuery) {
        foreach ($slotBySign as $sign => $slot) {
            if (empty($parsedQuery[$sign])) {
                continue;
            }
            $group = ['type' => $type, '+' => [], '' => [], '-' => []];
            $group[$slot] = $parsedQuery[$sign];
            $grouped[$sign][] = $group;
        }
    }

    return $this->_getMergedKeywordResults($context, $grouped, null, $publishedFrom, $publishedTo, $resultsPerKeyword);
}
/**
 * Recursive helper for _getMergedArray.
 *
 * Combines the results of a keyword group: required phrases are intersected, optional
 * phrases add to the counts (and add new entries only when the group has no required
 * phrases), excluded phrases remove entries.
 *
 * @return array Results keyed by submission ID
 */
public function _getMergedKeywordResults($context, &$keyword, $type, $publishedFrom, $publishedTo, $resultsPerKeyword)
{
    // A group may carry its own search type, which overrides the inherited one
    $type = $keyword['type'] ?? $type;

    // Required phrases: keep only the submissions present in every result set
    $merged = null;
    foreach ($keyword['+'] as $requiredPhrase) {
        $hits = $this->_getMergedPhraseResults($context, $requiredPhrase, $type, $publishedFrom, $publishedTo, $resultsPerKeyword);
        if ($merged === null) {
            $merged = $hits;
            continue;
        }
        foreach (array_keys($merged) as $submissionId) {
            if (!isset($hits[$submissionId])) {
                unset($merged[$submissionId]);
                continue;
            }
            $merged[$submissionId]['count'] += $hits[$submissionId]['count'];
        }
    }
    if ($merged == null) {
        $merged = [];
    }

    // Required phrases without any match: nothing can be added or removed anymore
    $hasRequired = !empty($keyword['+']);
    if ($hasRequired && empty($merged)) {
        return $merged;
    }

    // Optional phrases
    foreach ($keyword[''] as $optionalPhrase) {
        $hits = $this->_getMergedPhraseResults($context, $optionalPhrase, $type, $publishedFrom, $publishedTo, $resultsPerKeyword);
        foreach ($hits as $submissionId => $data) {
            if (isset($merged[$submissionId])) {
                $merged[$submissionId]['count'] += $data['count'];
            } elseif (!$hasRequired) {
                $merged[$submissionId] = $data;
            }
        }
    }

    // Excluded phrases
    foreach ($keyword['-'] as $excludedPhrase) {
        $hits = $this->_getMergedPhraseResults($context, $excludedPhrase, $type, $publishedFrom, $publishedTo, $resultsPerKeyword);
        foreach (array_keys($hits) as $submissionId) {
            unset($merged[$submissionId]);
        }
    }

    return $merged;
}
/**
 * Recursive helper for _getMergedArray.
 *
 * A phrase with a '+' entry is a nested keyword group and is merged recursively;
 * anything else is looked up through the search DAO.
 */
protected function _getMergedPhraseResults($context, &$phrase, $type, $publishedFrom, $publishedTo, $resultsPerKeyword)
{
    if (!isset($phrase['+'])) {
        $searchDao = $this->getSearchDao();
        return $searchDao->getPhraseResults($context, $phrase, $publishedFrom, $publishedTo, $type, $resultsPerKeyword);
    }

    return $this->_getMergedKeywordResults($context, $phrase, $type, $publishedFrom, $publishedTo, $resultsPerKeyword);
}
/**
 * Return an array of search results matching the supplied
 * keyword IDs in decreasing order of match quality.
 * Keywords are supplied in an array of the following format:
 *  $keywords[SUBMISSION_SEARCH_AUTHOR] = array('John', 'Doe');
 *  $keywords[SUBMISSION_SEARCH_...] = array(...);
 *  $keywords[null] = array('Matches', 'All', 'Fields');
 *
 * @param Request $request
 * @param Context $context The context to search
 * @param array $keywords List of keywords
 * @param string $error a reference to a variable that will
 *  contain an error message if the search service produces
 *  an error.
 * @param string $publishedFrom Search-from date
 * @param string $publishedTo Search-to date
 * @param ?\PKP\db\DBResultRange $rangeInfo Information on the range of results to return
 * @param array $exclude An array of article IDs to exclude from the result.
 *
 * @return VirtualArrayIterator An iterator with one entry per retrieved
 *  article containing the article, published submission, issue, context, etc.
 */
public function retrieveResults($request, $context, $keywords, &$error, $publishedFrom = null, $publishedTo = null, $rangeInfo = null, $exclude = [])
{
    // Pagination: first page with the default size unless a valid range was supplied
    $page = 1;
    $itemsPerPage = self::SUBMISSION_SEARCH_DEFAULT_RESULT_LIMIT;
    if ($rangeInfo && $rangeInfo->isValid()) {
        $page = $rangeInfo->getPage();
        $itemsPerPage = $rangeInfo->getCount();
    }

    // Result set ordering.
    [$orderBy, $orderDir] = $this->getResultSetOrdering($request);

    // Give search plug-ins the chance to provide ranked search results.
    $totalResults = null;
    $results = null;
    $hookResult = Hook::call(
        'SubmissionSearch::retrieveResults',
        [&$context, &$keywords, $publishedFrom, $publishedTo, $orderBy, $orderDir, $exclude, $page, $itemsPerPage, &$totalResults, &$error, &$results]
    );

    // Without a search plug-in, use the default database search implementation.
    if ($hookResult === false) {
        // Parse the query of every search type.
        foreach (array_keys($keywords) as $searchType) {
            $keywords[$searchType] = $this->_parseQuery($keywords[$searchType]);
        }

        // Collect the results of all keywords into one array where
        // mergedResults[submission_id] = sum of the occurrences of all
        // keywords associated with that article ID.
        $mergedResults = $this->_getMergedArray($context, $keywords, $publishedFrom, $publishedTo);

        // Turn the merged results into an array (frequencyIndicator => $submissionId).
        // The frequencyIndicator is a synthetically-generated number, where higher is
        // better, indicating the quality of the match. It is generated in such a
        // manner that matches with identical frequency do not collide.
        $results = $this->getSparseArray($mergedResults, $orderBy, $orderDir, $exclude);
        $totalResults = count($results);

        // Keep only the results of the requested page.
        $offset = $itemsPerPage * ($page - 1);
        $length = min($itemsPerPage, max($totalResults - $offset, 0));
        $results = $length == 0 ? [] : array_slice($results, $offset, $length);
    }

    // Load the Article, Journal, and associated objects for the selected range.
    $formattedResults = $this->formatResults($results, $request->getUser());

    // Return the appropriate iterator.
    return new VirtualArrayIterator($formattedResults, $totalResults, $page, $itemsPerPage);
}
/**
 * List the directions a result set can be ordered in.
 *
 * @return array Localized labels keyed by direction ('asc', 'desc')
 */
public function getResultSetOrderingDirectionOptions()
{
    $ascendingLabel = __('search.results.orderDir.asc');
    $descendingLabel = __('search.results.orderDir.desc');

    return ['asc' => $ascendingLabel, 'desc' => $descendingLabel];
}
/**
 * Determine the result set ordering requested by the user
 * (default: the 'score' field in its default direction).
 *
 * Values that are missing or not among the available options are
 * replaced by the defaults.
 *
 * @param Request $request
 *
 * @return array An array with the order field as the
 *  first entry and the order direction as the second
 *  entry.
 */
public function getResultSetOrdering($request)
{
    // Order field.
    $orderBy = $request->getUserVar('orderBy');
    $knownFields = array_keys($this->getResultSetOrderingOptions($request));
    $fieldAccepted = !is_null($orderBy) && in_array($orderBy, $knownFields);
    if (!$fieldAccepted) {
        $orderBy = 'score';
    }

    // Ordering direction.
    $orderDir = $request->getUserVar('orderDir');
    $knownDirections = array_keys($this->getResultSetOrderingDirectionOptions());
    $directionAccepted = !is_null($orderDir) && in_array($orderDir, $knownDirections);
    if (!$directionAccepted) {
        $orderDir = $this->getDefaultOrderDir($orderBy);
    }

    return [$orderBy, $orderDir];
}
//
// Methods to be implemented by subclasses.
//
/**
 * See implementation of retrieveResults for a description of this
 * function.
 *
 * retrieveResults() passes the page of results it selected together with the
 * user of the current request, and hands the return value to the result iterator.
 *
 * Note that this function is also called externally to fetch
 * results for the title index, and possibly elsewhere.
 *
 * @param array $results
 * @param User $user optional (if availability information is desired)
 *
 * @return array
 */
abstract public function formatResults($results, $user = null);
/**
 * Return the available options for result set ordering.
 *
 * The keys of the returned array are the values getResultSetOrdering()
 * accepts for the "orderBy" request variable.
 *
 * @param Request $request
 *
 * @return array
 */
abstract public function getResultSetOrderingOptions($request);
/**
 * See implementation of retrieveResults for a description of this
 * function.
 *
 * retrieveResults() calls it with the merged results, the order field, the order
 * direction and the IDs to exclude; the returned array is then counted and paginated.
 *
 * @param array $unorderedResults
 * @param string $orderBy
 * @param string $orderDir
 * @param array $exclude
 */
abstract protected function getSparseArray($unorderedResults, $orderBy, $orderDir, $exclude);
/**
 * Return the default order direction.
 *
 * Used by getResultSetOrdering() when the request carries no valid direction.
 *
 * @param string $orderBy
 *
 * @return string
 */
abstract protected function getDefaultOrderDir($orderBy);
/**
 * Return the search DAO
 *
 * The returned object must provide the getPhraseResults() method
 * used by _getMergedPhraseResults().
 *
 * @return DAO
 */
abstract protected function getSearchDao();
}
// Unless strict mode is enabled, also register the class under its non-namespaced (global)
// name and expose each of its constants as a global constant of the same name
if (!PKP_STRICT_MODE) {
class_alias('\PKP\search\SubmissionSearch', '\SubmissionSearch');
foreach ([
'SUBMISSION_SEARCH_AUTHOR',
'SUBMISSION_SEARCH_TITLE',
'SUBMISSION_SEARCH_ABSTRACT',
'SUBMISSION_SEARCH_DISCIPLINE',
'SUBMISSION_SEARCH_SUBJECT',
'SUBMISSION_SEARCH_KEYWORD',
'SUBMISSION_SEARCH_TYPE',
'SUBMISSION_SEARCH_COVERAGE',
'SUBMISSION_SEARCH_GALLEY_FILE',
'SUBMISSION_SEARCH_SUPPLEMENTARY_FILE',
'SUBMISSION_SEARCH_INDEX_TERMS',
'SUBMISSION_SEARCH_DEFAULT_RESULT_LIMIT',
] as $constantName) {
// The value is read through the alias registered above
define($constantName, constant('\SubmissionSearch::' . $constantName));
}
}
@@ -0,0 +1,179 @@
<?php
/**
* @file classes/search/SubmissionSearchDAO.php
*
* Copyright (c) 2014-2021 Simon Fraser University
* Copyright (c) 2003-2021 John Willinsky
* Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
*
* @class SubmissionSearchDAO
*
* @ingroup search
*
* @see SubmissionSearch
*
* @brief DAO class for submission search index.
*/
namespace PKP\search;
use Illuminate\Database\Query\Builder;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\DB;
use PKP\core\PKPString;
class SubmissionSearchDAO extends \PKP\db\DAO
{
/**
 * Remove the search objects of a submission, optionally narrowed
 * down by object type and associated ID.
 *
 * @param int $submissionId
 * @param int $type optional
 * @param int $assocId optional
 */
public function deleteSubmissionKeywords($submissionId, $type = null, $assocId = null)
{
    $objects = DB::table('submission_search_objects')
        ->where('submission_id', '=', $submissionId);

    // The optional filters only apply when a value was supplied
    if ($type !== null) {
        $objects->where('type', '=', $type);
    }
    if ($assocId !== null) {
        $objects->where('assoc_id', '=', $assocId);
    }

    $objects->delete();
}
/**
 * Register a submission object in the index. When the object is already
 * registered, its indexed keywords are cleared and its ID is reused.
 *
 * @param int $submissionId
 * @param int $type
 * @param ?int $assocId When null, the lookup of an existing object ignores the associated ID
 *
 * @return int the object ID
 */
public function insertObject($submissionId, $type, $assocId)
{
    $lookup = DB::table('submission_search_objects')
        ->where('submission_id', '=', $submissionId)
        ->where('type', '=', $type);
    if ($assocId !== null) {
        $lookup->where('assoc_id', '=', $assocId);
    }
    $objectId = $lookup->value('object_id');

    // Unknown object: create it
    if (!$objectId) {
        $row = [
            'submission_id' => $submissionId,
            'type' => $type,
            'assoc_id' => $assocId
        ];
        return DB::table('submission_search_objects')->insertGetId($row, 'object_id');
    }

    // Known object: clear the old keywords
    DB::table('submission_search_object_keywords')
        ->where('object_id', '=', $objectId)
        ->delete();

    return $objectId;
}
/**
 * Index the occurrences of a list of keywords in an object.
 *
 * Keywords longer than SubmissionSearchIndex::SEARCH_KEYWORD_MAX_LENGTH are discarded.
 * Keywords missing from the keyword list table are inserted first; then one row per
 * keyword is added for the object, with positions continuing after the highest
 * position already stored for it.
 *
 * @param int $objectId ID of the search object the keywords belong to
 * @param array $keywords Keywords in the order of their occurrence
 */
public function insertObjectKeywords(int $objectId, array $keywords): void
{
// Keyword => ID cache; being a static variable, it is kept between calls
/** @var array<string,?int> */
static $keywordMap = [];
// Discard long keywords
$keywords = collect($keywords)
->filter(fn (string $keyword) => PKPString::strlen($keyword) <= SubmissionSearchIndex::SEARCH_KEYWORD_MAX_LENGTH);
// Quit if there's no keywords
if (!$keywords->count()) {
return;
}
$chunkedUnmappedKeywords = $keywords
// Skip mapped keywords
->diff(array_keys($keywordMap))
// Chunk by 1000
->chunk(1000);
// The map is captured by reference so that the IDs found below end up in the static cache
$chunkedUnmappedKeywords->map(function (Collection $keywords) use (&$keywordMap) {
$missingKeywords = collect();
// Update the map with the existing IDs. Due to the database collation, very similar keywords might end up with the same ID
foreach ($this->getKeywordIdMap($keywords) as $keyword => $id) {
if ($id) {
$keywordMap[$keyword] = $id;
} else {
$missingKeywords->push($keyword);
}
}
// Batch insert keywords that don't exist using the "ignore" feature to deal with collation issues (e.g. attempt to insert "a" and "ã" at the same time might fail)
// This isn't executed first just to avoid "burning" IDs due to existing keywords
DB::table('submission_search_keyword_list')->insertOrIgnore(
$missingKeywords
->map(fn (string $keyword) => ['keyword_text' => $keyword])
->toArray()
);
// Grab the map with the new IDs
foreach ($this->getKeywordIdMap($missingKeywords) as $keyword => $id) {
$keywordMap[$keyword] = $id;
}
});
// Get the current position (-1 when the object has no keywords yet, so that numbering starts at 0)
$position = DB::table('submission_search_object_keywords')
->where('object_id', $objectId)
->max('pos') ?? -1;
$keywords
// Skip missed keywords (probably not needed, present for correctness)
->filter(fn (string $keyword) => isset($keywordMap[$keyword]))
// Convert to batch insert format
// ($position is captured by reference so that it keeps increasing across the keywords)
->map(function (string $keyword) use (&$position, $objectId, $keywordMap) {
return [
'object_id' => $objectId,
'keyword_id' => $keywordMap[$keyword],
'pos' => ++$position
];
})
// Chunk by 1000
->chunk(1000)
// Batch insert
->map(fn (Collection $data) => DB::table('submission_search_object_keywords')->insert($data->toArray()));
}
/**
 * Empty the search index: all search objects first, then the keyword list.
 */
public function clearIndex()
{
    foreach (['submission_search_objects', 'submission_search_keyword_list'] as $table) {
        DB::table($table)->delete();
    }
}
/**
 * Builds a keyword => ID map for the given keywords.
 * Keywords without a matching row in the keyword list are mapped to null (left join).
 *
 * @param Collection<int,string> $keywords
 * @return Collection<string,?int>
 */
private function getKeywordIdMap(Collection $keywords): Collection
{
    $keywordCount = $keywords->count();
    if ($keywordCount === 0) {
        return collect();
    }

    // Inline keyword table made of one placeholder per keyword
    // (sequence of "SELECT ? AS keyword UNION ALL SELECT ?...")
    $selects = 'SELECT ? AS keyword' . str_repeat(' UNION ALL SELECT ?', $keywordCount - 1);
    $inlineTable = DB::raw('(' . $selects . ') AS tmp');

    $query = DB::table($inlineTable)
        ->setBindings($keywords->toArray(), 'from')
        ->leftJoin('submission_search_keyword_list AS sskl', 'sskl.keyword_text', '=', 'tmp.keyword');

    return $query->pluck('sskl.keyword_id', 'tmp.keyword');
}
}
// Unless strict mode is enabled, also register the class under its non-namespaced (global) name
if (!PKP_STRICT_MODE) {
class_alias('\PKP\search\SubmissionSearchDAO', '\SubmissionSearchDAO');
}
@@ -0,0 +1,140 @@
<?php
/**
* @file classes/search/SubmissionSearchIndex.php
*
* Copyright (c) 2014-2021 Simon Fraser University
* Copyright (c) 2003-2021 John Willinsky
* Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
*
* @class SubmissionSearchIndex
*
* @ingroup search
*
* @brief Class to maintain a submission search index.
*/
namespace PKP\search;
use APP\submission\Submission;
use PKP\config\Config;
use PKP\core\PKPString;
abstract class SubmissionSearchIndex
{
public const SEARCH_STOPWORDS_FILE = 'lib/pkp/registry/stopwords.txt';
// Words are truncated to at most this length
public const SEARCH_KEYWORD_MAX_LENGTH = 40;
/**
 * Split a string into a clean array of keywords
 *
 * The text is cleaned of invalid UTF-8 sequences, control characters, marks, punctuation,
 * symbols and separators, converted to lower case and split on whitespace. Stop words are
 * always dropped; short and numeric words are dropped unless explicitly allowed.
 *
 * @param string|array $text Text to split; an array is joined with line breaks first
 * @param bool $allowWildcards Whether "*" is kept as a wildcard (converted to "%") or treated as a separator
 * @param bool $allowShortWords Whether words shorter than the "min_word_length" setting are kept
 * @param bool $allowNumericWords Whether words composed solely of numbers are kept
 *
 * @return string[] of keywords, each truncated to SEARCH_KEYWORD_MAX_LENGTH characters
 */
public static function filterKeywords($text, $allowWildcards = false, bool $allowShortWords = false, bool $allowNumericWords = false): array
{
    $minLength = Config::getVar('search', 'min_word_length');
    $stopwords = static::loadStopwords();

    // Join multiple lines into a single string
    if (is_array($text)) {
        $text = join("\n", $text);
    }

    // Attempts to fix bad UTF-8 characters
    $previous = mb_substitute_character();
    mb_substitute_character('none');
    $text = mb_convert_encoding($text, 'UTF-8', 'UTF-8');
    mb_substitute_character($previous);

    // Removes all control (C) characters, marks (M), punctuations (P), symbols (S) and separators (Z) except "*" (which is addressed below)
    // The "*" exclusion is tested for every character of a run: as "*" is itself a punctuation, testing
    // only the first character would swallow a wildcard preceded by a space or another punctuation (e.g. "foo *bar")
    $text = PKPString::regexp_replace('/(?:(?!\*)[\\p{C}\\p{M}\\p{P}\\p{S}\\p{Z}])+/', ' ', $text);
    $text = PKPString::regexp_replace('/[\*]/', $allowWildcards ? '%' : ' ', $text);
    $text = PKPString::strtolower($text);

    // Split into words
    $words = PKPString::regexp_split('/\s+/', $text);

    // FIXME Do not perform further filtering for some fields, e.g., author names?
    $keywords = [];
    foreach ($words as $word) {
        // Ignores: stop words, short words (when $allowShortWords is false) and words composed solely of numbers (when $allowNumericWords is false)
        if (empty($stopwords[$word]) && ($allowShortWords || PKPString::strlen($word) >= $minLength) && ($allowNumericWords || !is_numeric($word))) {
            $keywords[] = PKPString::substr($word, 0, static::SEARCH_KEYWORD_MAX_LENGTH);
        }
    }
    return $keywords;
}
/**
 * Return the stop word lookup table, read from the stop words file on first use.
 * FIXME: Should this be locale-specific?
 *
 * @return array<string,int> Stop words as keys and 1 as value; the empty string is included as a key
 */
protected static function loadStopwords()
{
    static $lookup;
    if ($lookup !== null) {
        return $lookup;
    }

    $words = collect(file(base_path(static::SEARCH_STOPWORDS_FILE)))
        ->map(fn (string $line) => trim($line))
        // Ignore comments/line-breaks
        ->reject(fn (string $line) => empty($line) || $line[0] === '#')
        // Include a map for empty words
        ->push('');

    $lookup = array_fill_keys($words->toArray(), 1);
    return $lookup;
}
/**
 * Let the indexing back-end know that the current transaction
 * finished so that the index can be batch-updated.
 */
abstract public function submissionChangesFinished();
/**
 * Signal to the indexing back-end that the metadata of a submission
 * changed.
 *
 * Push indexing implementations will try to immediately update
 * the index to reflect the changes. Pull implementations will
 * mark articles as "changed" and let the indexing back-end decide
 * the best point in time to actually index the changed data.
 *
 * @param Submission $submission
 */
abstract public function submissionMetadataChanged($submission);
/**
 * Remove indexed file contents for a submission
 *
 * @param Submission $submission
 */
abstract public function clearSubmissionFiles($submission);
/**
 * Delete a submission's search indexing
 *
 * @param int $submissionId ID of the submission whose indexing is deleted
 * @param int $type optional
 * @param int $assocId optional
 */
abstract public function deleteTextIndex(
int $submissionId,
$type = null,
$assocId = null
);
}
// Unless strict mode is enabled, also register the class under its non-namespaced (global)
// name and expose its constants as global constants of the same name
if (!PKP_STRICT_MODE) {
class_alias('\PKP\search\SubmissionSearchIndex', '\SubmissionSearchIndex');
foreach (['SEARCH_STOPWORDS_FILE', 'SEARCH_KEYWORD_MAX_LENGTH'] as $constantName) {
// The value is read through the alias registered above
define($constantName, constant('\SubmissionSearchIndex::' . $constantName));
}
}