first commit
This commit is contained in:
@@ -0,0 +1,162 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* @defgroup search Search
|
||||
* Implements search tools, such as file parsers, workflow integration,
|
||||
* indexing, querying, etc.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file classes/search/SearchFileParser.php
|
||||
*
|
||||
* Copyright (c) 2014-2021 Simon Fraser University
|
||||
* Copyright (c) 2000-2021 John Willinsky
|
||||
* Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
|
||||
*
|
||||
* @class SearchFileParser
|
||||
*
|
||||
* @ingroup search
|
||||
*
|
||||
* @brief Abstract class to extract search text from a given file.
|
||||
*/
|
||||
|
||||
namespace PKP\search;
|
||||
|
||||
use Exception;
|
||||
use PKP\config\Config;
|
||||
use PKP\submissionFile\SubmissionFile;
|
||||
|
||||
class SearchFileParser
{
    /** @var string the complete path to the file */
    public $filePath;

    /** @var resource|false|null file handle; not a usable stream before open() and after close() */
    public $fp;

    /**
     * Constructor.
     *
     * @param string $filePath Complete path of the file to read
     */
    public function __construct($filePath)
    {
        $this->filePath = $filePath;
    }

    /**
     * Return the path to the file.
     *
     * @return string
     */
    public function getFilePath()
    {
        return $this->filePath;
    }

    /**
     * Change the file path.
     *
     * Only stores the new path; a handle that is already open is not touched.
     *
     * @param string $filePath
     */
    public function setFilePath($filePath)
    {
        $this->filePath = $filePath;
    }

    /**
     * Open the file for binary reading.
     *
     * @throws Exception If the file cannot be opened
     *
     * @return bool Always true (failures are reported through the exception)
     */
    public function open()
    {
        // Forget older errors so that the message below can only describe this fopen() call
        error_clear_last();
        if (!($this->fp = @fopen($this->filePath, 'rb'))) {
            // error_get_last() returns an array (or null), so only its "message" entry can be appended to a string
            $lastError = error_get_last()['message'] ?? 'unknown error';
            throw new Exception("Failed to parse the file \"{$this->filePath}\". Last error: " . $lastError);
        }
        return true;
    }

    /**
     * Close the file.
     *
     * Safe to call when nothing is open and safe to call more than once.
     */
    public function close()
    {
        if ($this->fp) {
            fclose($this->fp);
            // Drop the closed handle: feof()/fclose() on a closed resource raise a TypeError
            $this->fp = null;
        }
    }

    /**
     * Read and return the next block/line of text.
     *
     * @return string|false The next chunk as produced by doRead(), false when
     *  no file is open or the end of the file was reached
     */
    public function read()
    {
        if (!$this->fp || feof($this->fp)) {
            return false;
        }
        return $this->doRead();
    }

    /**
     * Read from the file pointer.
     *
     * Extension point for subclasses; expects an open handle (read() checks that).
     *
     * @return string|false The next line including its line ending, false if nothing could be read
     */
    public function doRead()
    {
        return fgets($this->fp);
    }


    //
    // Static methods
    //

    /**
     * Create a text parser for a submission file.
     *
     * The path is built from the "files_dir" setting (section "files") and the
     * file's "path" data; the parser is chosen from the file's "mimetype" data.
     *
     * @param SubmissionFile $submissionFile
     *
     * @return ?SearchFileParser Null when no parser is available for the type
     */
    public static function fromFile($submissionFile)
    {
        $fullPath = rtrim(Config::getVar('files', 'files_dir'), '/') . '/' . $submissionFile->getData('path');
        return static::fromFileType($submissionFile->getData('mimetype'), $fullPath);
    }

    /**
     * Create a text parser for a file of the given type.
     *
     * A truthy "index[<type>]" setting in the "search" section selects the
     * external helper program; a defined but falsy setting disables parsing of
     * that type; without a setting the built-in parsers are used.
     *
     * @param string $type MIME type of the file
     * @param string $path Complete path of the file
     *
     * @return ?SearchFileParser Null when the type is disabled or unsupported
     */
    public static function fromFileType($type, $path)
    {
        if (Config::getVar('search', "index[{$type}]")) {
            $parserType = 'process';
        } else {
            // If an indexer definition exists, but its value is falsy, we assume the user wants to disable the default handler
            $parserType = Config::hasVar('search', "index[{$type}]") ? 'disabled' : $type;
        }

        return match ($parserType) {
            // External process
            'process' => new SearchHelperParser($type, $path),
            // Text processor
            'text/plain' => new static($path),
            // HTML/XML processor
            'text/html', 'text/xml', 'application/xhtml', 'application/xml' => new SearchHTMLParser($path),
            // Disabled/no suitable parser
            default => null
        };
    }
}
|
||||
|
||||
// Backwards compatibility: while PKP_STRICT_MODE is falsy, the class is also
// registered under the global (non-namespaced) name SearchFileParser.
if (!PKP_STRICT_MODE) {
|
||||
class_alias('\PKP\search\SearchFileParser', '\SearchFileParser');
|
||||
}
|
||||
@@ -0,0 +1,35 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* @file classes/search/SearchHTMLParser.php
|
||||
*
|
||||
* Copyright (c) 2014-2021 Simon Fraser University
|
||||
* Copyright (c) 2000-2021 John Willinsky
|
||||
* Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
|
||||
*
|
||||
* @class SearchHTMLParser
|
||||
*
|
||||
* @ingroup search
|
||||
*
|
||||
* @brief Class to extract text from an HTML file.
|
||||
*/
|
||||
|
||||
namespace PKP\search;
|
||||
|
||||
class SearchHTMLParser extends SearchFileParser
{
    /**
     * Read the next line from the file pointer and reduce it to plain text.
     *
     * Tags are stripped line by line, so a tag that spans several lines is not
     * recognised as one.
     *
     * @return string|false The line without HTML tags and with entities decoded,
     *  false if nothing could be read (end of file or read failure)
     */
    public function doRead()
    {
        $line = fgets($this->fp);
        if ($line === false) {
            // Report EOF/read failures as false instead of masking them as an empty line
            return false;
        }

        // strip HTML tags from the read line, then convert HTML entities to valid UTF-8 characters
        return html_entity_decode(strip_tags($line), ENT_COMPAT, 'UTF-8');
    }
}
|
||||
|
||||
// Backwards compatibility: while PKP_STRICT_MODE is falsy, the class is also
// registered under the global (non-namespaced) name SearchHTMLParser.
if (!PKP_STRICT_MODE) {
|
||||
class_alias('\PKP\search\SearchHTMLParser', '\SearchHTMLParser');
|
||||
}
|
||||
@@ -0,0 +1,59 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* @file classes/search/SearchHelperParser.php
|
||||
*
|
||||
* Copyright (c) 2014-2021 Simon Fraser University
|
||||
* Copyright (c) 2000-2021 John Willinsky
|
||||
* Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
|
||||
*
|
||||
* @class SearchHelperParser
|
||||
*
|
||||
* @ingroup search
|
||||
*
|
||||
* @brief Class to extract text from a file using an external helper program.
|
||||
*/
|
||||
|
||||
namespace PKP\search;
|
||||
|
||||
use Exception;
|
||||
use PKP\config\Config;
|
||||
|
||||
class SearchHelperParser extends SearchFileParser
{
    /** @var string Type should match an index[$type] setting in the "search" section of config.inc.php */
    public $type;

    /** @var ?string The command line built by open(); kept for error messages */
    private $command;

    /**
     * Constructor.
     *
     * @param string $type File type used to look up the helper program
     * @param string $filePath Complete path of the file to parse
     */
    public function __construct($type, $filePath)
    {
        parent::__construct($filePath);
        $this->type = $type;
    }

    /**
     * Start the helper program configured for the file type and read from its output.
     *
     * The "index[<type>]" setting is used as a sprintf() template which receives
     * the shell-escaped file path.
     *
     * @throws Exception If the process cannot be started
     *
     * @return bool True when the process was started, false when no helper is configured
     */
    public function open()
    {
        $prog = Config::getVar('search', 'index[' . $this->type . ']');
        if (!isset($prog)) {
            return false;
        }

        $this->command = sprintf($prog, escapeshellarg($this->getFilePath()));
        // Forget older errors so that the message below can only describe this popen() call
        error_clear_last();
        if (!($this->fp = @popen($this->command, 'r'))) {
            // error_get_last() returns an array (or null), so only its "message" entry can be appended to a string
            $lastError = error_get_last()['message'] ?? 'unknown error';
            throw new Exception("Failed to parse file {$this->getFilePath()} through the command: {$this->command}\nLast error: " . $lastError);
        }
        return true;
    }

    /**
     * Close the process and check how it terminated.
     *
     * Safe to call when nothing is open and safe to call more than once.
     *
     * @throws Exception If pclose() reports a non-zero result
     */
    public function close()
    {
        if (!$this->fp) {
            return;
        }

        $exitCode = pclose($this->fp);
        // Drop the handle before reporting anything: a second pclose() on it would raise a TypeError
        $this->fp = null;
        if ($exitCode) {
            throw new Exception("The indexation process exited with the code \"{$exitCode}\", perhaps the command failed: {$this->command}");
        }
    }
}
|
||||
|
||||
// Backwards compatibility: while PKP_STRICT_MODE is falsy, the class is also
// registered under the global (non-namespaced) name SearchHelperParser.
if (!PKP_STRICT_MODE) {
|
||||
class_alias('\PKP\search\SearchHelperParser', '\SearchHelperParser');
|
||||
}
|
||||
@@ -0,0 +1,433 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* @file classes/search/SubmissionSearch.php
|
||||
*
|
||||
* Copyright (c) 2014-2021 Simon Fraser University
|
||||
* Copyright (c) 2003-2021 John Willinsky
|
||||
* Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
|
||||
*
|
||||
* @class SubmissionSearch
|
||||
*
|
||||
* @ingroup search
|
||||
*
|
||||
* @see SubmissionSearchDAO
|
||||
*
|
||||
* @brief Class for retrieving search results.
|
||||
*
|
||||
* FIXME: NEAR; precedence w/o parens?; stemming; weighted counting
|
||||
*/
|
||||
|
||||
namespace PKP\search;
|
||||
|
||||
use APP\core\Application;
|
||||
use APP\core\Request;
|
||||
use PKP\config\Config;
|
||||
use PKP\context\Context;
|
||||
use PKP\core\PKPString;
|
||||
use PKP\core\VirtualArrayIterator;
|
||||
use PKP\db\DAO;
|
||||
use PKP\plugins\Hook;
|
||||
use PKP\user\User;
|
||||
|
||||
abstract class SubmissionSearch
|
||||
{
|
||||
// Search types
|
||||
public const SUBMISSION_SEARCH_AUTHOR = 1;
|
||||
public const SUBMISSION_SEARCH_TITLE = 2;
|
||||
public const SUBMISSION_SEARCH_ABSTRACT = 4;
|
||||
public const SUBMISSION_SEARCH_DISCIPLINE = 8;
|
||||
public const SUBMISSION_SEARCH_SUBJECT = 16;
|
||||
public const SUBMISSION_SEARCH_KEYWORD = 17;
|
||||
public const SUBMISSION_SEARCH_TYPE = 32;
|
||||
public const SUBMISSION_SEARCH_COVERAGE = 64;
|
||||
public const SUBMISSION_SEARCH_GALLEY_FILE = 128;
|
||||
public const SUBMISSION_SEARCH_SUPPLEMENTARY_FILE = 256;
|
||||
public const SUBMISSION_SEARCH_INDEX_TERMS = 120;
|
||||
|
||||
public const SUBMISSION_SEARCH_DEFAULT_RESULT_LIMIT = 20;
|
||||
|
||||
/**
 * Constructor.
 *
 * Has nothing to set up; it exists so that subclasses can call
 * parent::__construct().
 */
public function __construct()
{
}
|
||||
|
||||
/**
 * Tokenize a search query string and turn the tokens into a keyword tree.
 * Supports +/- signs, the localized AND/OR/NOT operators and parentheses.
 *
 * @param string $query
 *
 * @return array of the form ('+' => <required>, '' => <optional>, '-' => excluded)
 */
public function _parseQuery($query)
{
    // Group 1 holds the optional sign of each token, group 2 the token itself
    // (a quoted phrase, a parenthesis or a run of non-space characters)
    $tokenCount = PKPString::regexp_match_all('/(\+|\-|)("[^"]+"|\(|\)|[^\s\)]+)/', $query, $groups);

    // The cursor is handed over by reference and advanced by the recursive helper
    $cursor = 0;
    return $this->_parseQueryInternal($groups[1], $groups[2], $cursor, $tokenCount);
}
|
||||
|
||||
/**
 * Query parsing helper routine.
 * Returned structure is based on that used by the Search::QueryParser Perl module.
 *
 * Consumes tokens from position $pos until the tokens run out or a closing
 * parenthesis is met; an opening parenthesis is parsed recursively and stored
 * as a nested structure.
 *
 * @param array $signTokens Sign ('+', '-' or '') of each token
 * @param array $tokens The tokens
 * @param int $pos Current position, advanced while tokens are consumed
 * @param int $total Number of tokens
 *
 * @return array of the form ('+' => <required>, '' => <optional>, '-' => excluded)
 */
public function _parseQueryInternal($signTokens, $tokens, &$pos, $total)
{
    $parsed = ['+' => [], '' => [], '-' => []];
    // Boolean operator that followed the previous term ('', 'or' or 'and')
    $previousBool = '';

    $searchIndex = Application::getSubmissionSearchIndex();

    $operatorNot = PKPString::strtolower(__('search.operator.not'));
    $operatorAnd = PKPString::strtolower(__('search.operator.and'));
    $operatorOr = PKPString::strtolower(__('search.operator.or'));

    while ($pos < $total) {
        // An explicit sign wins; otherwise keep a pending sign (e.g. set by NOT) or default to "required"
        if (!empty($signTokens[$pos])) {
            $sign = $signTokens[$pos];
        } elseif (empty($sign)) {
            $sign = '+';
        }

        $token = PKPString::strtolower($tokens[$pos++]);

        if ($token == $operatorNot) {
            // The negation applies to the term that follows
            $sign = '-';
            continue;
        }
        if ($token == ')') {
            return $parsed;
        }
        if ($token == '(') {
            // From here on the group is handled like a plain term
            $token = $this->_parseQueryInternal($signTokens, $tokens, $pos, $total);
        }

        // Consume a boolean operator that directly follows the term
        $nextBool = '';
        if ($pos < $total) {
            $upcoming = PKPString::strtolower($tokens[$pos]);
            if ($upcoming == $operatorOr) {
                $nextBool = 'or';
                $pos++;
            } elseif ($upcoming == $operatorAnd) {
                $nextBool = 'and';
                $pos++;
            }
        }

        // A term is ruled by the operator after it, or else by the one before it
        $effectiveBool = empty($nextBool) ? $previousBool : $nextBool;
        $previousBool = $nextBool;
        if ($effectiveBool == 'or') {
            $sign = '';
        }

        $entry = is_array($token) ? $token : $searchIndex->filterKeywords($token, true);
        if (!empty($entry)) {
            $parsed[$sign][] = $entry;
        }
        $sign = '';
    }

    return $parsed;
}
|
||||
|
||||
/**
 * Combines the parsed queries of all search types into one keyword tree and
 * fetches the merged results for it.
 *
 * Every non-empty sign group of every search type becomes a sub-query that
 * remembers its search type. Excluded terms are stored as optional terms of a
 * sub-query inside the top-level '-' group.
 *
 * @param array $keywords Parsed queries, keyed by search type
 *
 * @return array The merged results as returned by _getMergedKeywordResults
 */
public function _getMergedArray($context, &$keywords, $publishedFrom, $publishedTo)
{
    $combined = ['+' => [], '' => [], '-' => []];

    foreach ($keywords as $searchType => $parsedQuery) {
        if (!empty($parsedQuery['+'])) {
            $combined['+'][] = ['type' => $searchType, '+' => $parsedQuery['+'], '' => [], '-' => []];
        }
        if (!empty($parsedQuery[''])) {
            $combined[''][] = ['type' => $searchType, '+' => [], '' => $parsedQuery[''], '-' => []];
        }
        if (!empty($parsedQuery['-'])) {
            $combined['-'][] = ['type' => $searchType, '+' => [], '' => $parsedQuery['-'], '-' => []];
        }
    }

    return $this->_getMergedKeywordResults(
        $context,
        $combined,
        null,
        $publishedFrom,
        $publishedTo,
        Config::getVar('search', 'results_per_keyword', 100)
    );
}
|
||||
|
||||
/**
 * Recursive helper for _getMergedArray.
 *
 * Evaluates one node of the keyword tree: required phrases are intersected,
 * optional phrases add to the counts (and add new entries only when there are
 * no required phrases), excluded phrases remove entries.
 *
 * @param array $keyword Node of the form ('+' => [...], '' => [...], '-' => [...]), optionally with a 'type'
 * @param ?int $type Search type inherited from the parent node
 *
 * @return array Results keyed by submission ID, each with a 'count' entry
 */
public function _getMergedKeywordResults($context, &$keyword, $type, $publishedFrom, $publishedTo, $resultsPerKeyword)
{
    // A node may carry its own search type, which overrides the inherited one
    $type = $keyword['type'] ?? $type;

    $matches = null;
    foreach ($keyword['+'] as $requiredPhrase) {
        $phraseMatches = $this->_getMergedPhraseResults($context, $requiredPhrase, $type, $publishedFrom, $publishedTo, $resultsPerKeyword);
        if ($matches === null) {
            $matches = $phraseMatches;
            continue;
        }
        // Keep only the submissions that match every required phrase
        foreach (array_keys($matches) as $submissionId) {
            if (isset($phraseMatches[$submissionId])) {
                $matches[$submissionId]['count'] += $phraseMatches[$submissionId]['count'];
            } else {
                unset($matches[$submissionId]);
            }
        }
    }

    if ($matches == null) {
        $matches = [];
    }

    $hasRequired = !empty($keyword['+']);
    if (empty($matches) && $hasRequired) {
        // The required phrases matched nothing, the other groups cannot change that
        return $matches;
    }

    foreach ($keyword[''] as $optionalPhrase) {
        $phraseMatches = $this->_getMergedPhraseResults($context, $optionalPhrase, $type, $publishedFrom, $publishedTo, $resultsPerKeyword);
        foreach ($phraseMatches as $submissionId => $data) {
            if (isset($matches[$submissionId])) {
                $matches[$submissionId]['count'] += $data['count'];
            } elseif (!$hasRequired) {
                $matches[$submissionId] = $data;
            }
        }
    }

    foreach ($keyword['-'] as $excludedPhrase) {
        $phraseMatches = $this->_getMergedPhraseResults($context, $excludedPhrase, $type, $publishedFrom, $publishedTo, $resultsPerKeyword);
        foreach (array_keys($phraseMatches) as $submissionId) {
            unset($matches[$submissionId]);
        }
    }

    return $matches;
}
|
||||
|
||||
/**
 * Recursive helper for _getMergedArray.
 *
 * Resolves a single entry of a keyword tree: a nested node (recognised by its
 * '+' key) goes back to _getMergedKeywordResults, anything else is looked up
 * through the search DAO.
 *
 * @param array $phrase Nested node or phrase
 *
 * @return array The results of the nested node or of the DAO lookup
 */
protected function _getMergedPhraseResults($context, &$phrase, $type, $publishedFrom, $publishedTo, $resultsPerKeyword)
{
    if (!isset($phrase['+'])) {
        $searchDao = $this->getSearchDao();
        return $searchDao->getPhraseResults($context, $phrase, $publishedFrom, $publishedTo, $type, $resultsPerKeyword);
    }

    return $this->_getMergedKeywordResults($context, $phrase, $type, $publishedFrom, $publishedTo, $resultsPerKeyword);
}
|
||||
|
||||
/**
 * Return an array of search results matching the supplied
 * keyword IDs in decreasing order of match quality.
 * Keywords are supplied in an array of the following format:
 *  $keywords[SUBMISSION_SEARCH_AUTHOR] = array('John', 'Doe');
 *  $keywords[SUBMISSION_SEARCH_...] = array(...);
 *  $keywords[null] = array('Matches', 'All', 'Fields');
 *
 * The hook "SubmissionSearch::retrieveResults" is called first; the database
 * search below only runs when the hook call returns false.
 *
 * @param Request $request
 * @param Context $context The context to search
 * @param array $keywords List of keywords
 * @param string $error a reference to a variable that will
 *  contain an error message if the search service produces
 *  an error.
 * @param string $publishedFrom Search-from date
 * @param string $publishedTo Search-to date
 * @param ?\PKP\db\DBResultRange $rangeInfo Information on the range of results to return
 * @param array $exclude An array of article IDs to exclude from the result.
 *
 * @return VirtualArrayIterator An iterator with one entry per retrieved
 *  article containing the article, published submission, issue, context, etc.
 */
public function retrieveResults($request, $context, $keywords, &$error, $publishedFrom = null, $publishedTo = null, $rangeInfo = null, $exclude = [])
{
    // Pagination: first page with the default limit unless a valid range was given
    $page = 1;
    $itemsPerPage = self::SUBMISSION_SEARCH_DEFAULT_RESULT_LIMIT;
    if ($rangeInfo && $rangeInfo->isValid()) {
        $page = $rangeInfo->getPage();
        $itemsPerPage = $rangeInfo->getCount();
    }

    // Result set ordering.
    [$orderBy, $orderDir] = $this->getResultSetOrdering($request);

    // Give a search plug-in the chance to provide ranked search results.
    $totalResults = null;
    $results = null;
    $hookArguments = [&$context, &$keywords, $publishedFrom, $publishedTo, $orderBy, $orderDir, $exclude, $page, $itemsPerPage, &$totalResults, &$error, &$results];
    $handledByPlugin = Hook::call('SubmissionSearch::retrieveResults', $hookArguments) !== false;

    if (!$handledByPlugin) {
        // Default database search: parse the query of each search type first.
        foreach ($keywords as $searchType => $query) {
            $keywords[$searchType] = $this->_parseQuery($query);
        }

        // One entry per submission ID, carrying the summed occurrences of
        // all the keywords associated with that submission.
        $mergedResults = $this->_getMergedArray($context, $keywords, $publishedFrom, $publishedTo);

        // Convert mergedResults into an array (frequencyIndicator =>
        // $submissionId). The frequencyIndicator is a synthetically-generated
        // number, where higher is better, generated in such a manner that
        // matches with identical frequency do not collide.
        $results = $this->getSparseArray($mergedResults, $orderBy, $orderDir, $exclude);
        $totalResults = count($results);

        // Cut out the requested page; a page past the end yields no results.
        $offset = $itemsPerPage * ($page - 1);
        $length = min($itemsPerPage, max($totalResults - $offset, 0));
        $results = $length == 0 ? [] : array_slice($results, $offset, $length);
    }

    // Load the objects that belong to the selected results.
    $results = $this->formatResults($results, $request->getUser());

    return new VirtualArrayIterator($results, $totalResults, $page, $itemsPerPage);
}
|
||||
|
||||
/**
 * Return the available options for the result
 * set ordering direction.
 *
 * @return array Translated labels, keyed by 'asc' and 'desc'
 */
public function getResultSetOrderingDirectionOptions()
{
    $options = [];
    $options['asc'] = __('search.results.orderDir.asc');
    $options['desc'] = __('search.results.orderDir.desc');
    return $options;
}
|
||||
|
||||
/**
 * Return the currently selected result
 * set ordering option (default: descending relevance).
 *
 * Reads the request variables "orderBy" and "orderDir"; a value that is
 * missing or not among the available options is replaced by the default.
 *
 * @param Request $request
 *
 * @return array An array with the order field as the
 *  first entry and the order direction as the second
 *  entry.
 */
public function getResultSetOrdering($request)
{
    // Order field: falls back to 'score'.
    $requestedField = $request->getUserVar('orderBy');
    $fieldOptions = $this->getResultSetOrderingOptions($request);
    $fieldIsValid = !is_null($requestedField) && in_array($requestedField, array_keys($fieldOptions));
    $orderBy = $fieldIsValid ? $requestedField : 'score';

    // Ordering direction: falls back to the default direction of the field.
    $requestedDirection = $request->getUserVar('orderDir');
    $directionOptions = $this->getResultSetOrderingDirectionOptions();
    $directionIsValid = !is_null($requestedDirection) && in_array($requestedDirection, array_keys($directionOptions));
    $orderDir = $directionIsValid ? $requestedDirection : $this->getDefaultOrderDir($orderBy);

    return [$orderBy, $orderDir];
}
|
||||
|
||||
//
|
||||
// Methods to be implemented by subclasses.
|
||||
//
|
||||
/**
|
||||
* See implementation of retrieveResults for a description of this
|
||||
* function.
|
||||
*
|
||||
* Note that this function is also called externally to fetch
|
||||
* results for the title index, and possibly elsewhere.
|
||||
*
|
||||
* @param array $results
|
||||
* @param User $user optional (if availability information is desired)
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
abstract public function formatResults($results, $user = null);
|
||||
|
||||
/**
|
||||
* Return the available options for result set ordering.
|
||||
*
|
||||
* @param Request $request
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
abstract public function getResultSetOrderingOptions($request);
|
||||
|
||||
/**
 * Convert the merged keyword results into the list of results that
 * retrieveResults counts and paginates.
 *
 * retrieveResults describes the expected result as an array
 * (frequencyIndicator => submission ID), where the frequencyIndicator is a
 * synthetically-generated number (higher is better) built in such a manner
 * that matches with identical frequency do not collide.
 *
 * @param array $unorderedResults The merged results produced by _getMergedArray
 * @param string $orderBy Order field, as returned by getResultSetOrdering
 * @param string $orderDir Order direction, as returned by getResultSetOrdering
 * @param array $exclude IDs to exclude from the result
 *
 * @return array The result of this method is passed to count() and
 *  array_slice() and, once reduced to one page, to formatResults
 */
|
||||
abstract protected function getSparseArray($unorderedResults, $orderBy, $orderDir, $exclude);
|
||||
|
||||
/**
|
||||
* Return the default order direction.
|
||||
*
|
||||
* @param string $orderBy
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
abstract protected function getDefaultOrderDir($orderBy);
|
||||
|
||||
/**
|
||||
* Return the search DAO
|
||||
*
|
||||
* @return DAO
|
||||
*/
|
||||
abstract protected function getSearchDao();
|
||||
}
|
||||
|
||||
// Backwards compatibility: while PKP_STRICT_MODE is falsy, the class is also
// registered under the global name SubmissionSearch and each class constant
// listed below is published as a global constant of the same name.
if (!PKP_STRICT_MODE) {
|
||||
class_alias('\PKP\search\SubmissionSearch', '\SubmissionSearch');
|
||||
foreach ([
|
||||
'SUBMISSION_SEARCH_AUTHOR',
|
||||
'SUBMISSION_SEARCH_TITLE',
|
||||
'SUBMISSION_SEARCH_ABSTRACT',
|
||||
'SUBMISSION_SEARCH_DISCIPLINE',
|
||||
'SUBMISSION_SEARCH_SUBJECT',
|
||||
'SUBMISSION_SEARCH_KEYWORD',
|
||||
'SUBMISSION_SEARCH_TYPE',
|
||||
'SUBMISSION_SEARCH_COVERAGE',
|
||||
'SUBMISSION_SEARCH_GALLEY_FILE',
|
||||
'SUBMISSION_SEARCH_SUPPLEMENTARY_FILE',
|
||||
'SUBMISSION_SEARCH_INDEX_TERMS',
|
||||
'SUBMISSION_SEARCH_DEFAULT_RESULT_LIMIT',
|
||||
] as $constantName) {
|
||||
// The value is read through the global alias registered above
define($constantName, constant('\SubmissionSearch::' . $constantName));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,179 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* @file classes/search/SubmissionSearchDAO.php
|
||||
*
|
||||
* Copyright (c) 2014-2021 Simon Fraser University
|
||||
* Copyright (c) 2003-2021 John Willinsky
|
||||
* Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
|
||||
*
|
||||
* @class SubmissionSearchDAO
|
||||
*
|
||||
* @ingroup search
|
||||
*
|
||||
* @see SubmissionSearch
|
||||
*
|
||||
* @brief DAO class for submission search index.
|
||||
*/
|
||||
|
||||
namespace PKP\search;
|
||||
|
||||
use Illuminate\Database\Query\Builder;
|
||||
use Illuminate\Support\Collection;
|
||||
use Illuminate\Support\Facades\DB;
|
||||
use PKP\core\PKPString;
|
||||
|
||||
class SubmissionSearchDAO extends \PKP\db\DAO
|
||||
{
|
||||
/**
 * Delete the search objects of a submission, optionally limited to one
 * type and/or one associated object.
 *
 * @param int $submissionId
 * @param int $type optional; only objects of this type are deleted
 * @param int $assocId optional; only objects with this assoc ID are deleted
 */
public function deleteSubmissionKeywords($submissionId, $type = null, $assocId = null)
{
    $query = DB::table('submission_search_objects')
        ->where('submission_id', '=', $submissionId);

    // The optional filters only apply when a value was given
    if (isset($type)) {
        $query->where('type', '=', $type);
    }
    if (isset($assocId)) {
        $query->where('assoc_id', '=', $assocId);
    }

    $query->delete();
}
|
||||
|
||||
/**
 * Add a submission object to the index (if already exists, indexed keywords are cleared).
 *
 * @param int $submissionId
 * @param int $type
 * @param ?int $assocId When null, the lookup for an existing object does not filter by assoc ID
 *
 * @return int the object ID
 */
public function insertObject($submissionId, $type, $assocId)
{
    $lookup = DB::table('submission_search_objects')
        ->where('submission_id', '=', $submissionId)
        ->where('type', '=', $type);
    if ($assocId !== null) {
        $lookup->where('assoc_id', '=', $assocId);
    }
    $existingId = $lookup->value('object_id');

    if (!$existingId) {
        // Not indexed yet: create the object
        $newObject = [
            'submission_id' => $submissionId,
            'type' => $type,
            'assoc_id' => $assocId
        ];
        return DB::table('submission_search_objects')->insertGetId($newObject, 'object_id');
    }

    // Already indexed: keep the object and clear its old keywords
    DB::table('submission_search_object_keywords')
        ->where('object_id', '=', $existingId)
        ->delete();
    return $existingId;
}
|
||||
|
||||
/**
 * Index the occurrences of a list of keywords in an object.
 *
 * Keywords longer than SubmissionSearchIndex::SEARCH_KEYWORD_MAX_LENGTH are
 * discarded. Unknown keywords are added to the keyword list, then one row per
 * keyword occurrence is appended after the object's highest existing position.
 * The keyword => ID map is kept in a static variable between calls.
 *
 * @param int $objectId
 * @param string[] $keywords Keywords in the order of their occurrence
 */
public function insertObjectKeywords(int $objectId, array $keywords): void
{
    /** @var array<string,?int> */
    static $keywordMap = [];

    // Discard long keywords
    $maxLength = SubmissionSearchIndex::SEARCH_KEYWORD_MAX_LENGTH;
    $accepted = collect($keywords)
        ->filter(fn (string $keyword) => PKPString::strlen($keyword) <= $maxLength);

    // Quit if there's no keywords
    if ($accepted->count() === 0) {
        return;
    }

    // Resolve the IDs of the keywords that are not mapped yet, 1000 at a time
    $unmappedChunks = $accepted->diff(array_keys($keywordMap))->chunk(1000);
    foreach ($unmappedChunks as $chunk) {
        // Update the map with the existing IDs. Due to the database collation, very similar keywords might end up with the same ID
        $missing = collect();
        foreach ($this->getKeywordIdMap($chunk) as $keyword => $id) {
            if ($id) {
                $keywordMap[$keyword] = $id;
            } else {
                $missing->push($keyword);
            }
        }

        // Batch insert keywords that don't exist using the "ignore" feature to deal with collation issues (e.g. attempt to insert "a" and "ã" at the same time might fail)
        // This isn't executed first just to avoid "burning" IDs due to existing keywords
        $newRows = $missing
            ->map(fn (string $keyword) => ['keyword_text' => $keyword])
            ->toArray();
        DB::table('submission_search_keyword_list')->insertOrIgnore($newRows);

        // Grab the map with the new IDs
        foreach ($this->getKeywordIdMap($missing) as $keyword => $id) {
            $keywordMap[$keyword] = $id;
        }
    }

    // Continue after the highest position already stored for the object
    $position = DB::table('submission_search_object_keywords')
        ->where('object_id', $objectId)
        ->max('pos') ?? -1;

    // Build one row per keyword occurrence
    $rows = [];
    foreach ($accepted as $keyword) {
        // Skip missed keywords (probably not needed, present for correctness)
        if (!isset($keywordMap[$keyword])) {
            continue;
        }
        $rows[] = [
            'object_id' => $objectId,
            'keyword_id' => $keywordMap[$keyword],
            'pos' => ++$position
        ];
    }

    // Batch insert, 1000 rows at a time
    foreach (array_chunk($rows, 1000) as $batch) {
        DB::table('submission_search_object_keywords')->insert($batch);
    }
}
|
||||
|
||||
/**
 * Clear the search index.
 *
 * Deletes every row of the search object table, then every row of the
 * keyword list table.
 */
public function clearIndex()
{
    foreach (['submission_search_objects', 'submission_search_keyword_list'] as $table) {
        DB::table($table)->delete();
    }
}
|
||||
|
||||
/**
 * Retrieves a keyword => ID map for the given keywords
 *
 * The keywords are left-joined against the keyword list, so a keyword without
 * an entry in the list is mapped to null.
 *
 * @param Collection<int,string> $keywords
 *
 * @return Collection<string,?int>
 */
private function getKeywordIdMap(Collection $keywords): Collection
{
    $total = $keywords->count();
    if ($total === 0) {
        return collect();
    }

    // Temporary keyword table: one bound placeholder per keyword ("SELECT ? AS keyword UNION ALL SELECT ?...")
    $keywordRows = 'SELECT ? AS keyword' . str_repeat(' UNION ALL SELECT ?', $total - 1);

    return DB::table(DB::raw('(' . $keywordRows . ') AS tmp'))
        ->setBindings($keywords->toArray(), 'from')
        ->leftJoin('submission_search_keyword_list AS sskl', 'sskl.keyword_text', '=', 'tmp.keyword')
        ->pluck('sskl.keyword_id', 'tmp.keyword');
}
|
||||
}
|
||||
|
||||
// Backwards compatibility: while PKP_STRICT_MODE is falsy, the class is also
// registered under the global (non-namespaced) name SubmissionSearchDAO.
if (!PKP_STRICT_MODE) {
|
||||
class_alias('\PKP\search\SubmissionSearchDAO', '\SubmissionSearchDAO');
|
||||
}
|
||||
@@ -0,0 +1,140 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* @file classes/search/SubmissionSearchIndex.php
|
||||
*
|
||||
* Copyright (c) 2014-2021 Simon Fraser University
|
||||
* Copyright (c) 2003-2021 John Willinsky
|
||||
* Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
|
||||
*
|
||||
* @class SubmissionSearchIndex
|
||||
*
|
||||
* @ingroup search
|
||||
*
|
||||
* @brief Class to maintain a submission search index.
|
||||
*/
|
||||
|
||||
namespace PKP\search;
|
||||
|
||||
use APP\submission\Submission;
|
||||
use PKP\config\Config;
|
||||
use PKP\core\PKPString;
|
||||
|
||||
abstract class SubmissionSearchIndex
|
||||
{
|
||||
public const SEARCH_STOPWORDS_FILE = 'lib/pkp/registry/stopwords.txt';
|
||||
|
||||
// Words are truncated to at most this length
|
||||
public const SEARCH_KEYWORD_MAX_LENGTH = 40;
|
||||
|
||||
/**
 * Split a string into a clean array of keywords
 *
 * The text is cleaned from invalid UTF-8 sequences, separators are replaced by
 * spaces, the text is lower-cased and split into words. Stop words are always
 * dropped; the remaining words are truncated to SEARCH_KEYWORD_MAX_LENGTH.
 *
 * @param string|array $text Text, or list of lines which are joined first
 * @param bool $allowWildcards Whether "*" is kept as the wildcard "%" (true) or treated as a separator (false)
 * @param bool $allowShortWords Whether words shorter than the "min_word_length" setting are kept
 * @param bool $allowNumericWords Whether numeric words are kept
 *
 * @return string[] of keywords
 */
public static function filterKeywords($text, $allowWildcards = false, bool $allowShortWords = false, bool $allowNumericWords = false): array
{
    $minLength = Config::getVar('search', 'min_word_length');
    $stopwords = static::loadStopwords();

    // Join multiple lines into a single string
    if (is_array($text)) {
        $text = join("\n", $text);
    }

    // Attempts to fix bad UTF-8 characters: invalid sequences are dropped during the conversion
    $previous = mb_substitute_character();
    mb_substitute_character('none');
    $text = mb_convert_encoding($text, 'UTF-8', 'UTF-8');
    mb_substitute_character($previous);

    // Removes all control (C) characters, marks (M), punctuations (P), symbols (S) and separators (Z) except "*" (which is addressed below)
    // The lookahead is applied to every character of a run: "*" is punctuation itself, so checking only the
    // first character would let a run such as " *" swallow the wildcard
    $text = PKPString::regexp_replace('/(?:(?!\*)[\\p{C}\\p{M}\\p{P}\\p{S}\\p{Z}])+/', ' ', $text);
    $text = PKPString::regexp_replace('/[\*]/', $allowWildcards ? '%' : ' ', $text);
    $text = PKPString::strtolower($text);

    // Split into words
    $words = PKPString::regexp_split('/\s+/', $text);

    // FIXME Do not perform further filtering for some fields, e.g., author names?

    $keywords = [];
    foreach ($words as $word) {
        // Ignores: stop words, short words (when $allowShortWords is false) and words composed solely of numbers (when $allowNumericWords is false)
        if (empty($stopwords[$word]) && ($allowShortWords || PKPString::strlen($word) >= $minLength) && ($allowNumericWords || !is_numeric($word))) {
            $keywords[] = PKPString::substr($word, 0, static::SEARCH_KEYWORD_MAX_LENGTH);
        }
    }
    return $keywords;
}
|
||||
|
||||
/**
 * Return list of stopwords.
 * FIXME: Should this be locale-specific?
 *
 * The stop words file is read on the first call only; the result is kept in a
 * static variable afterwards.
 *
 * @return array<string,int> Stop words as keys and 1 as value; the empty
 *  string is always part of the list
 */
protected static function loadStopwords()
{
    static $searchStopwords;

    if (isset($searchStopwords)) {
        return $searchStopwords;
    }

    $words = collect(file(base_path(static::SEARCH_STOPWORDS_FILE)))
        ->map(fn (string $line) => trim($line))
        // Ignore comments/line-breaks
        ->reject(fn (string $word) => empty($word) || $word[0] === '#')
        // Include a map for empty words
        ->push('')
        ->toArray();

    return $searchStopwords = array_fill_keys($words, 1);
}
|
||||
|
||||
/**
|
||||
* Let the indexing back-end know that the current transaction
|
||||
* finished so that the index can be batch-updated.
|
||||
*/
|
||||
abstract public function submissionChangesFinished();
|
||||
|
||||
/**
|
||||
* Signal to the indexing back-end that the metadata of a submission
|
||||
* changed.
|
||||
*
|
||||
* Push indexing implementations will try to immediately update
|
||||
* the index to reflect the changes. Pull implementations will
|
||||
* mark articles as "changed" and let the indexing back-end decide
|
||||
* the best point in time to actually index the changed data.
|
||||
*
|
||||
* @param Submission $submission
|
||||
*/
|
||||
abstract public function submissionMetadataChanged($submission);
|
||||
|
||||
/**
|
||||
* Remove indexed file contents for a submission
|
||||
*
|
||||
* @param Submission $submission
|
||||
*/
|
||||
abstract public function clearSubmissionFiles($submission);
|
||||
|
||||
/**
|
||||
* Delete a submission's search indexing
|
||||
*
|
||||
* @param int $type optional
|
||||
* @param int $assocId optional
|
||||
*/
|
||||
abstract public function deleteTextIndex(
|
||||
int $submissionId,
|
||||
$type = null,
|
||||
$assocId = null
|
||||
);
|
||||
}
|
||||
|
||||
// Backwards compatibility: while PKP_STRICT_MODE is falsy, the class is also
// registered under the global name SubmissionSearchIndex and its two class
// constants are published as global constants of the same name.
if (!PKP_STRICT_MODE) {
|
||||
class_alias('\PKP\search\SubmissionSearchIndex', '\SubmissionSearchIndex');
|
||||
foreach (['SEARCH_STOPWORDS_FILE', 'SEARCH_KEYWORD_MAX_LENGTH'] as $constantName) {
|
||||
// The value is read through the global alias registered above
define($constantName, constant('\SubmissionSearchIndex::' . $constantName));
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user