<?php

namespace app\common;

/**
 * Crossref API helper.
 *
 * Notes:
 * - Modelled on the implementation style of application/api/controller/Crossrefdoi.php,
 *   extracted into a service.
 * - Only performs "request + parse" (plus in-memory citation QC); it contains no
 *   database reads or writes.
 */
class CrossrefService
{
    // Configuration
    private $mailto = '';                                     // contact e-mail (raises request priority)
    private $timeout = 15;                                    // request timeout in seconds
    private $maxRetry = 2;                                    // maximum number of attempts per DOI
    private $crossrefUrl = "https://api.crossref.org/works/"; // API endpoint
    private $verifySsl = true;                                // verify the TLS peer certificate

    /**
     * @param array $config Optional overrides. Recognised keys:
     *                      mailto (string), timeout (int), maxRetry (int),
     *                      crossrefUrl (string), verifySsl (bool, default true).
     *                      Anything that is not an array is ignored.
     */
    public function __construct($config = [])
    {
        if (!is_array($config)) {
            return;
        }
        if (isset($config['mailto'])) {
            $this->mailto = (string)$config['mailto'];
        }
        if (isset($config['timeout'])) {
            $this->timeout = intval($config['timeout']);
        }
        if (isset($config['maxRetry'])) {
            $this->maxRetry = intval($config['maxRetry']);
        }
        if (isset($config['crossrefUrl'])) {
            $this->crossrefUrl = (string)$config['crossrefUrl'];
        }
        if (isset($config['verifySsl'])) {
            $this->verifySsl = (bool)$config['verifySsl'];
        }
    }

    /**
     * Set the contact e-mail sent with every request.
     *
     * @param string $mailto
     * @return $this
     */
    public function setMailto($mailto)
    {
        $this->mailto = (string)$mailto;
        return $this;
    }

    /**
     * Citation-mark conversion: [n] in the body text corresponds to
     * production_article_refer.index = n-1 (index is zero-based).
     *
     * @param int $citationMark Citation number in the text, e.g. 13 (from [13])
     * @return int production_article_refer.index, e.g. 12 (never below 0)
     */
    public function referIndexFromCitationMark(int $citationMark): int
    {
        return max(0, $citationMark - 1);
    }

    /**
     * Reverse conversion: production_article_refer.index (zero-based) to the
     * citation number [n] used in the text.
     *
     * @param int $referIndex production_article_refer.index, e.g. 12
     * @return int Citation number n, e.g. 13 (never below 0)
     */
    public function citationMarkFromReferIndex(int $referIndex): int
    {
        return max(0, $referIndex + 1);
    }

    /**
     * Batch citation QC (no database access).
     *
     * - Takes the article section contents (the content list of t_article_main).
     * - Takes the reference entries (rows of production_article_refer).
     * - Extracts the English sentence context of every citation mark, maps mark n
     *   to refer.index = n-1 and evaluates it the same way qcCitation() does.
     *
     * Besides plain [n], grouped marks such as [1, 2] and [3-5] are expanded.
     * The retraction lookup is performed at most once per reference per call.
     *
     * @param array $articleMainContents Content fragments (in sort order); each element is a
     *                                   string or an array with a 'content' key
     * @param array $referRows           production_article_refer rows (at least
     *                                   index/title/author/joura/dateno/refer_doi/doilink)
     * @param array $options             Same options as qcCitation(), plus:
     *                                   - sentence_window (int) sentences of context on each side,
     *                                     default 1 (previous + current + next sentence)
     * @return array List of results; each has citation_mark/refer_index/context/ref_meta/qc
     */
    public function qcArticleCitations(array $articleMainContents, array $referRows, array $options = []): array
    {
        $window = isset($options['sentence_window']) ? max(0, intval($options['sentence_window'])) : 1;
        $checkRetraction = isset($options['check_retraction']) ? (bool)$options['check_retraction'] : true;

        // 1) Assemble the plain text of the whole article (citation marks are kept).
        $fullText = $this->buildPlainText($articleMainContents);
        if ($fullText === '') {
            return [];
        }

        // 2) Build the lookup: refer_index => row.
        $referMap = [];
        foreach ($referRows as $row) {
            if (is_array($row) && isset($row['index'])) {
                $referMap[intval($row['index'])] = $row;
            }
        }

        // 3) Split into English sentences.
        $sentences = $this->splitEnglishSentences($fullText);
        if (empty($sentences)) {
            return [];
        }
        $lastSentence = count($sentences) - 1;

        // 4) Walk the sentences and evaluate every citation mark found in them.
        $problemCache = []; // refer_index => retraction verdict, so each DOI is fetched once
        $results = [];
        foreach ($sentences as $si => $sentence) {
            foreach ($this->extractCitationMarks($sentence) as $citationMark) {
                $referIndex = $this->referIndexFromCitationMark($citationMark);
                if (!isset($referMap[$referIndex])) {
                    continue;
                }

                $start = max(0, $si - $window);
                $end = min($lastSentence, $si + $window);
                $ctx = implode(' ', array_slice($sentences, $start, $end - $start + 1));
                $ctx = trim($this->collapseWhitespace($ctx));

                $refMeta = $referMap[$referIndex];
                if (!isset($problemCache[$referIndex])) {
                    $problemCache[$referIndex] = $this->assessProblem($refMeta, $checkRetraction);
                }
                $qc = array_merge(
                    $problemCache[$referIndex],
                    $this->assessRelevance($ctx, $refMeta, $options)
                );

                $results[] = [
                    'citation_mark' => $citationMark, // number n in the text
                    'refer_index'   => $referIndex,   // production_article_refer.index
                    'context'       => $ctx,
                    'ref_meta'      => [
                        'p_refer_id' => $refMeta['p_refer_id'] ?? 0,
                        'title'      => $refMeta['title'] ?? '',
                        'author'     => $refMeta['author'] ?? '',
                        'joura'      => $refMeta['joura'] ?? '',
                        'dateno'     => $refMeta['dateno'] ?? '',
                        'refer_doi'  => $refMeta['refer_doi'] ?? '',
                        'doilink'    => $refMeta['doilink'] ?? '',
                        'index'      => $refMeta['index'] ?? $referIndex,
                    ],
                    'qc'            => $qc,
                ];
            }
        }

        return $results;
    }

    /**
     * Filter out invalid DOIs (only the 10.xxxx/xxx form is kept).
     *
     * @param string $doi
     * @return string The trimmed DOI, or '' when it does not look like a DOI
     */
    public function filterValidDoi($doi = '')
    {
        $doi = trim((string)$doi);
        if ($doi === '') {
            return '';
        }
        return preg_match('/^10\.\d{4,}\/.+/', $doi) ? $doi : '';
    }

    /**
     * Fetch the Crossref "message" object for a DOI (with retry).
     *
     * @param string $doi
     * @return array|null null when the DOI is invalid or the request failed
     */
    public function fetchWork($doi)
    {
        $doi = $this->filterValidDoi($doi);
        if ($doi === '') {
            return null;
        }
        return $this->fetchSingleDoiWithRetry($doi);
    }

    /**
     * Fetch a work and return the commonly used fields
     * (title / journal / authors / volume-issue-pages / retraction / URL).
     *
     * @param string $doi
     * @return array|null null when the DOI is invalid or the request failed
     */
    public function fetchWorkSummary($doi)
    {
        $msg = $this->fetchWork($doi);
        if (!is_array($msg) || empty($msg)) {
            return null;
        }

        $validDoi = $this->filterValidDoi($doi);
        $publisher = $this->getPublisher($msg);
        $joura = !empty($publisher['title']) ? $publisher['title'] : ($publisher['short_title'] ?? '');
        $authors = $this->getAuthors($msg);
        $retractInfo = $this->checkRetracted($msg);

        $dolink = $this->getDolink($msg);
        if (empty($dolink)) {
            $dolink = 'https://doi.org/' . $validDoi;
        }

        return [
            'doi'            => $validDoi,
            'title'          => $this->getTitle($msg),
            'joura'          => $joura,
            'publisher'      => $publisher,
            'authors'        => $authors,
            'author_str'     => empty($authors) ? '' : implode(',', $authors),
            'dateno'         => $this->getVolumeIssuePages($msg),
            'is_retracted'   => !empty($retractInfo['is_retracted']) ? 1 : 0,
            'retract_reason' => $retractInfo['reason'] ?? '',
            'doilink'        => $dolink,
            'raw'            => $msg,
        ];
    }

    /**
     * Query a single DOI, retrying on transient failures.
     *
     * At most max(1, maxRetry) attempts are made. HTTP 429 waits 5 seconds before
     * the next attempt, other transient failures wait 1 second; there is no wait
     * after the final attempt. Client errors other than 408/429 (for example 404
     * for an unknown DOI) are treated as permanent and are not retried.
     *
     * @param string $doi An already validated DOI
     * @return array|null The "message" part of the response, or null on failure
     */
    private function fetchSingleDoiWithRetry($doi)
    {
        $attempts = max(1, (int)$this->maxRetry);

        // Strip CR/LF so the address cannot break out of the User-Agent header line.
        $mailto = trim(str_replace(["\r", "\n"], '', (string)$this->mailto));

        $url = $this->crossrefUrl . rawurlencode($doi);
        $userAgent = 'DOI-Fetcher/1.0';
        if ($mailto !== '') {
            $url .= "?mailto=" . rawurlencode($mailto);
            $userAgent .= " (mailto:{$mailto})";
        }

        for ($attempt = 1; $attempt <= $attempts; $attempt++) {
            $ch = curl_init();
            if ($ch === false) {
                return null;
            }
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
            // Certificate verification stays on unless explicitly disabled via config.
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (bool)$this->verifySsl);
            curl_setopt($ch, CURLOPT_HTTPHEADER, ["User-Agent: {$userAgent}"]);
            $response = curl_exec($ch);
            $httpCode = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
            curl_close($ch);

            if ($httpCode === 200) {
                $data = is_string($response) ? json_decode($response, true) : null;
                if (is_array($data) && isset($data['status']) && $data['status'] === 'ok') {
                    return $data['message'] ?? null;
                }
                return null;
            }

            // A permanent client error will not succeed on a second try.
            if ($httpCode >= 400 && $httpCode < 500 && $httpCode !== 408 && $httpCode !== 429) {
                return null;
            }

            if ($attempt < $attempts) {
                sleep($httpCode === 429 ? 5 : 1);
            }
        }

        return null;
    }

    /**
     * Extract the title (first entry of the 'title' list).
     *
     * @param array $aDoiInfo Crossref message
     * @return string
     */
    public function getTitle($aDoiInfo = [])
    {
        return $aDoiInfo['title'][0] ?? '';
    }

    /**
     * Extract journal / publisher related information.
     *
     * @param array $aDoiInfo Crossref message
     * @return array Keys: title, short_title, ISSN, publisher
     */
    public function getPublisher($aDoiInfo = [])
    {
        return [
            'title'       => $aDoiInfo['container-title'][0] ?? '',
            'short_title' => $aDoiInfo['short-container-title'][0] ?? '',
            'ISSN'        => $aDoiInfo['ISSN'] ?? [],
            'publisher'   => $aDoiInfo['publisher'] ?? '',
        ];
    }

    /**
     * Extract the author list as "given family" strings.
     *
     * Entries that carry neither 'given' nor 'family' fall back to their 'name'
     * field when present.
     *
     * @param array $aDoiInfo Crossref message
     * @return array List of author names
     */
    public function getAuthors($aDoiInfo = [])
    {
        $authors = [];
        if (empty($aDoiInfo['author']) || !is_array($aDoiInfo['author'])) {
            return $authors;
        }
        foreach ($aDoiInfo['author'] as $author) {
            if (!is_array($author)) {
                continue;
            }
            $name = trim((string)($author['given'] ?? '') . ' ' . (string)($author['family'] ?? ''));
            if ($name === '') {
                $name = trim((string)($author['name'] ?? ''));
            }
            if ($name !== '') {
                $authors[] = $name;
            }
        }
        return $authors;
    }

    /**
     * Extract the publication year from issued.date-parts.
     *
     * @param array $aDoiInfo Crossref message
     * @return string Year, or '' when absent
     */
    public function getPublishYear($aDoiInfo = [])
    {
        if (!empty($aDoiInfo['issued']['date-parts'][0][0])) {
            return (string)$aDoiInfo['issued']['date-parts'][0][0];
        }
        return '';
    }

    /**
     * Build "year:volume(issue):first-last" (e.g. 2024:10(2):100-120).
     * Parts that are missing are left out.
     *
     * @param array $aDoiInfo Crossref message
     * @return string
     */
    public function getVolumeIssuePages($aDoiInfo = [])
    {
        $parts = [];

        $year = $this->getPublishYear($aDoiInfo);
        if ($year) {
            $parts[] = $year;
        }

        $volume = $aDoiInfo['volume'] ?? '';
        $issue = $aDoiInfo['issue'] ?? '';
        if ($volume) {
            $parts[] = $volume . ($issue ? "({$issue})" : '');
        }

        // 'page' may be a plain string ("100-120") or a start/end structure.
        $page = $aDoiInfo['page'] ?? '';
        $pageStart = is_array($page) ? ($page['start'] ?? '') : '';
        $pageEnd = is_array($page) ? ($page['end'] ?? '') : '';
        if (!$pageStart) {
            $pageStart = $aDoiInfo['first-page'] ?? '';
        }
        if (!$pageEnd) {
            $pageEnd = $aDoiInfo['last-page'] ?? '';
        }

        if ($pageStart) {
            $pages = $pageStart . ($pageEnd ? "-{$pageEnd}" : '');
        } else {
            // Never let a non-scalar 'page' reach implode().
            $pages = is_scalar($page) ? (string)$page : '';
        }
        if ($pages) {
            $parts[] = $pages;
        }

        return implode(':', $parts);
    }

    /**
     * Detect retracted (or corrected) works; same rules as Crossrefdoi.php.
     *
     * The checks run in a fixed order and a later match overwrites the reason of
     * an earlier one: type/subtype, update-type, relation, update-to, title keywords.
     *
     * @param array $aDoiInfo Crossref message
     * @return array ['is_retracted' => bool, 'reason' => string]
     */
    public function checkRetracted($aDoiInfo = [])
    {
        $isRetracted = false;
        $reason = "未撤稿";
        $flaggedTypes = ['retraction', 'correction'];

        $sType = strtolower((string)($aDoiInfo['type'] ?? ''));
        if ($sType !== '' && in_array($sType, $flaggedTypes, true)) {
            $isRetracted = true;
            $reason = "文章类型为{$sType}(撤稿/更正声明)";
        }
        $sSubtype = strtolower((string)($aDoiInfo['subtype'] ?? ''));
        if ($sSubtype !== '' && in_array($sSubtype, $flaggedTypes, true)) {
            $isRetracted = true;
            $reason = "文章类型为{$sSubtype}(撤稿/更正声明)";
        }

        if (isset($aDoiInfo['update-type']) && is_array($aDoiInfo['update-type'])
            && in_array('retraction', $aDoiInfo['update-type'], true)) {
            $isRetracted = true;
            $reason = "官方标记为撤稿(update-type: retraction)";
        }

        if (!empty($aDoiInfo['relation']) && is_array($aDoiInfo['relation'])) {
            foreach ($aDoiInfo['relation'] as $relType => $relItems) {
                if (in_array($relType, ['is-retraction-of', 'corrects'], true)) {
                    $isRetracted = true;
                    $relatedDoi = $relItems[0]['id'] ?? '未知';
                    $reason = "关联撤稿文章{$relatedDoi}(关系:{$relType})";
                    break;
                }
            }
        }

        if (isset($aDoiInfo['update-to']) && is_array($aDoiInfo['update-to'])) {
            foreach ($aDoiInfo['update-to'] as $update) {
                if (!is_array($update)) {
                    continue;
                }
                $updateType = strtolower((string)($update['type'] ?? ''));
                $updateLabel = strtolower((string)($update['label'] ?? ''));
                if (strpos($updateType, 'retract') !== false || strpos($updateLabel, 'retract') !== false) {
                    $isRetracted = true;
                    $reason = "update-to 标记撤稿({$updateType}/{$updateLabel})";
                    break;
                }
            }
        }

        $keywords = ['retraction', 'retracted', 'withdrawal', 'withdrawn'];
        foreach ((array)($aDoiInfo['title'] ?? []) as $title) {
            if (!is_string($title)) {
                continue;
            }
            $sTitleLower = strtolower($title);
            foreach ($keywords as $keyword) {
                if (strpos($sTitleLower, $keyword) !== false) {
                    $isRetracted = true;
                    $reason = "标题包含撤稿关键词";
                    break 2;
                }
            }
        }

        return [
            'is_retracted' => $isRetracted,
            'reason'       => $reason,
        ];
    }

    /**
     * Extract the DOI link ('URL' field).
     *
     * @param array $aDoiInfo Crossref message
     * @return string
     */
    public function getDolink($aDoiInfo = [])
    {
        return $aDoiInfo['URL'] ?? '';
    }

    /**
     * Format a Crossref date-parts object as Y, Y-MM or Y-MM-DD.
     *
     * @param array $dateObj Structure holding 'date-parts' => [[y, m, d]]
     * @return string '' when no date parts are present
     */
    public function parseDateParts($dateObj)
    {
        $parts = $dateObj['date-parts'][0] ?? [];
        if (empty($parts)) {
            return '';
        }
        $out = (string)($parts[0] ?? '');
        foreach ([1, 2] as $pos) {
            $value = $parts[$pos] ?? '';
            if ($value !== '') {
                $out .= '-' . str_pad((string)$value, 2, '0', STR_PAD_LEFT);
            }
        }
        return $out;
    }

    /**
     * Citation QC. Decides
     * (1) whether the cited entry looks retracted/corrected (based on Crossref), and
     * (2) whether the citing context is related to the cited entry (based on evidence hits).
     *
     * Notes:
     * - Intended for cases without abstract/keywords (only title/author/journal/year
     *   plus the citing sentences are used).
     * - When neither refer_doi nor doilink yields a DOI, problem_flag is 'unknown'.
     *
     * @param string $contextText Context around the citation (English; ideally the citing
     *                            sentence plus a few neighbours)
     * @param array  $refMeta     Metadata of the cited entry (e.g. a production_article_refer row):
     *                            refer_doi / doilink / title / author / joura / dateno
     * @param array  $options     Optional settings:
     *                            - check_retraction (bool) query Crossref for retraction/correction;
     *                              default true
     *                            - background_phrases (array) phrases that mark generic background
     *                              citations; a built-in list is used by default
     * @return array
     *   [
     *     'problem_flag'    => 'ok'|'retracted_or_corrected'|'unknown',
     *     'problem_reason'  => string,
     *     'relevance_flag'  => 'related'|'unsure'|'unsure_background'|'suspicious_unrelated',
     *     'relevance_score' => float,
     *     'reason'          => string
     *   ]
     */
    public function qcCitation(string $contextText, array $refMeta, array $options = []): array
    {
        $checkRetraction = isset($options['check_retraction']) ? (bool)$options['check_retraction'] : true;

        return array_merge(
            $this->assessProblem($refMeta, $checkRetraction),
            $this->assessRelevance(trim($contextText), $refMeta, $options)
        );
    }

    /**
     * Retraction/correction verdict for one reference (hard rule; drives problem_flag).
     *
     * @param array $refMeta         Reference metadata (refer_doi / doilink are used)
     * @param bool  $checkRetraction false skips the Crossref lookup
     * @return array ['problem_flag' => string, 'problem_reason' => string]
     */
    private function assessProblem(array $refMeta, bool $checkRetraction): array
    {
        if (!$checkRetraction) {
            return ['problem_flag' => 'unknown', 'problem_reason' => 'Skip retraction check'];
        }

        $doi = $this->extractDoiFromMeta(
            (string)($refMeta['refer_doi'] ?? ''),
            (string)($refMeta['doilink'] ?? '')
        );
        if ($doi === '') {
            return ['problem_flag' => 'unknown', 'problem_reason' => 'DOI is empty'];
        }

        $summary = $this->fetchWorkSummary($doi);
        if (!$summary || !isset($summary['is_retracted'])) {
            return [
                'problem_flag'   => 'unknown',
                'problem_reason' => 'Crossref fetch failed or returned unexpected data',
            ];
        }

        if ((int)$summary['is_retracted'] === 1) {
            return [
                'problem_flag'   => 'retracted_or_corrected',
                'problem_reason' => !empty($summary['retract_reason'])
                    ? $summary['retract_reason']
                    : 'Crossref indicates retraction/correction',
            ];
        }

        return ['problem_flag' => 'ok', 'problem_reason' => 'Crossref indicates not retracted/corrected'];
    }

    /**
     * Relevance verdict (soft rules + evidence hits) between a context and a reference.
     *
     * Score = 0.60 * title-token overlap + 0.20 * author hit + 0.15 * year hit
     *       + 0.05 * journal-token overlap, rounded to 4 decimals.
     *
     * @param string $contextText Trimmed citing context
     * @param array  $refMeta     Reference metadata (title / author / joura / dateno)
     * @param array  $options     Only 'background_phrases' is read here
     * @return array ['relevance_flag' => string, 'relevance_score' => float, 'reason' => string]
     */
    private function assessRelevance(string $contextText, array $refMeta, array $options): array
    {
        $backgroundPhrases = isset($options['background_phrases']) ? (array)$options['background_phrases'] : [
            'several studies',
            'many studies',
            'the literature',
            'the existing literature',
            'has been reported',
            'have been reported',
            'it has been shown',
            'previous studies',
            'the study suggests',
            'the literature suggests',
            'in the literature',
        ];

        $ctxLower = strtolower($contextText);
        $isBackground = false;
        foreach ($backgroundPhrases as $phrase) {
            $phrase = strtolower(trim((string)$phrase));
            if ($phrase !== '' && $phrase !== '0' && strpos($ctxLower, $phrase) !== false) {
                $isBackground = true;
                break;
            }
        }

        $refTokens = $this->buildEvidenceTokens([
            'title'   => (string)($refMeta['title'] ?? ''),
            'author'  => (string)($refMeta['author'] ?? ''),
            'journal' => (string)($refMeta['joura'] ?? ''),
            'year'    => (string)($refMeta['dateno'] ?? ''),
        ]);
        $ctxTokens = $this->tokenize($contextText);

        $titleTokens = $refTokens['titleTokens'] ?? [];
        $authorTokens = $refTokens['authorTokens'] ?? [];
        $journalTokens = $refTokens['journalTokens'] ?? [];
        $yearToken = $refTokens['yearToken'] ?? '';

        $titleOverlap = 0.0;
        if (!empty($titleTokens)) {
            $titleOverlap = count(array_intersect($titleTokens, $ctxTokens)) / max(1, count($titleTokens));
        }

        $authorHit = 0.0;
        foreach ($authorTokens as $authorToken) {
            if ($authorToken !== '' && in_array($authorToken, $ctxTokens, true)) {
                $authorHit = 1.0;
                break;
            }
        }

        $journalOverlap = 0.0;
        if (!empty($journalTokens)) {
            $journalOverlap = count(array_intersect($journalTokens, $ctxTokens)) / max(1, count($journalTokens));
        }

        $yearHit = 0.0;
        if (!empty($yearToken) && strpos($ctxLower, (string)$yearToken) !== false) {
            $yearHit = 1.0;
        }

        // Combined score (kept interpretable: higher means more related).
        $score = round((
            0.60 * $titleOverlap +
            0.20 * $authorHit +
            0.15 * $yearHit +
            0.05 * $journalOverlap
        ), 4);

        $reasonParts = [];
        if ($score >= 0.35 && ($authorHit > 0.0 || $yearHit > 0.0)) {
            $relevanceFlag = 'related';
            $reasonParts[] = 'title_keyword_overlap_high=' . $titleOverlap;
        } elseif ($score >= 0.25) {
            $relevanceFlag = 'unsure';
            $reasonParts[] = 'evidence_score_mid=' . $score;
        } elseif ($isBackground) {
            $relevanceFlag = 'unsure_background';
            $reasonParts[] = 'background_phrases_detected';
        } else {
            $relevanceFlag = 'suspicious_unrelated';
            $reasonParts[] = 'evidence_score_low=' . $score;
        }

        $reasonParts[] = 'titleOverlap=' . $titleOverlap;
        $reasonParts[] = 'authorHit=' . $authorHit;
        $reasonParts[] = 'yearHit=' . $yearHit;
        $reasonParts[] = 'journalOverlap=' . $journalOverlap;

        return [
            'relevance_flag'  => $relevanceFlag,
            'relevance_score' => (float)$score,
            'reason'          => implode('; ', $reasonParts),
        ];
    }

    /**
     * Extract a DOI string from refer_doi / doilink.
     *
     * refer_doi is tried first; when it does not yield a valid DOI the doilink is
     * tried as well.
     *
     * @param string $referDoi
     * @param string $doilink
     * @return string Valid DOI, or '' when neither field contains one
     */
    private function extractDoiFromMeta(string $referDoi, string $doilink): string
    {
        foreach ([$referDoi, $doilink] as $raw) {
            $doi = $this->normalizeDoiCandidate($raw);
            if ($doi !== '') {
                return $doi;
            }
        }
        return '';
    }

    /**
     * Turn one raw field value (bare DOI, "doi:..." or a doi.org URL) into a valid DOI.
     *
     * @param string $raw
     * @return string Valid DOI, or ''
     */
    private function normalizeDoiCandidate(string $raw): string
    {
        $raw = trim($raw);
        if ($raw === '') {
            return '';
        }

        // Common form: https://doi.org/10.xxxx/xxxx or http://dx.doi.org/...
        // '~' is the delimiter because '#' has to appear inside the character class.
        if (preg_match('~doi\.org/([^?#\s]+)~i', $raw, $m)) {
            // Links may carry the DOI percent-encoded (10.1234%2Fabc).
            return $this->filterValidDoi(rawurldecode($m[1]));
        }

        // Fallback: the value itself is a DOI, possibly written as "doi:10.xxxx/...".
        return $this->filterValidDoi((string)preg_replace('~^doi:\s*~i', '', $raw));
    }

    /**
     * Build the evidence tokens used for hit detection / rough similarity.
     *
     * @param array $src Keys: title, author, journal, year
     * @return array Keys: titleTokens, authorTokens, journalTokens, yearToken
     */
    private function buildEvidenceTokens(array $src): array
    {
        $stop = [
            'the','a','an','and','or','of','in','on','for','with','to','from','by','at','as','is','are',
            'was','were','be','been','being','that','this','these','those','which','who','whom','it','its',
            'we','our','us','they','their','them','i','you','your','he','she','his','her',
            'study','studies','report','reported','reports','model','models','analysis','analyses','method','methods',
            'results','result','using','used','show','shown','demonstrated','demonstrate',
        ];

        // Title and journal share the same rule: no stop words, at least 4 characters.
        $keepInformative = function ($token) use ($stop) {
            return !in_array($token, $stop, true) && strlen($token) >= 4;
        };

        $titleTokens = array_values(array_filter(
            $this->tokenize((string)($src['title'] ?? '')),
            $keepInformative
        ));
        $journalTokens = array_values(array_filter(
            $this->tokenize((string)($src['journal'] ?? '')),
            $keepInformative
        ));

        $authorTokens = [];
        foreach ($this->extractAuthorTokens((string)($src['author'] ?? '')) as $token) {
            $token = trim($token);
            if ($token !== '' && !in_array($token, $stop, true)) {
                $authorTokens[] = $token;
            }
        }
        $authorTokens = array_values(array_unique($authorTokens));

        $yearToken = '';
        if (preg_match('/(19\d{2}|20\d{2})/', (string)($src['year'] ?? ''), $m)) {
            $yearToken = (string)$m[1];
        }

        return [
            'titleTokens'   => $titleTokens,
            'authorTokens'  => $authorTokens,
            'journalTokens' => $journalTokens,
            'yearToken'     => $yearToken,
        ];
    }

    /**
     * Extract surname tokens from an author string (simplified).
     *
     * Each author (separated by ',', ';' or ' and ') contributes the tokens of its
     * last word that is not just initials, so both "John Smith" and "Smith J"
     * yield "smith". Words are split with tokenize(), the same way the citing
     * context is, and only tokens of at least 4 characters are kept.
     *
     * @param string $authorStr
     * @return array Lower-case surname tokens, de-duplicated
     */
    private function extractAuthorTokens(string $authorStr): array
    {
        $authorStr = trim($authorStr);
        if ($authorStr === '') {
            return [];
        }

        $tokens = [];
        foreach ((array)preg_split('/[,;]| and /i', $authorStr) as $person) {
            $person = trim((string)$person);
            if ($person === '') {
                continue;
            }

            $picked = [];
            foreach ((array)preg_split('/\s+/', $person) as $word) {
                $pieces = array_filter($this->tokenize((string)$word), function ($piece) {
                    return strlen($piece) >= 4;
                });
                if (!empty($pieces)) {
                    // A later word wins, so "John Smith" still resolves to "smith".
                    $picked = $pieces;
                }
            }
            foreach ($picked as $piece) {
                $tokens[] = $piece;
            }
        }

        return array_values(array_unique($tokens));
    }

    /**
     * Lightweight tokenizer for English text.
     *
     * Lower-cases the text, splits on anything that is not a-z / 0-9 and keeps
     * tokens of at least 3 characters.
     *
     * @param string $text
     * @return array Unique tokens in order of first appearance
     */
    private function tokenize(string $text): array
    {
        $text = strtolower(trim($text));
        if ($text === '') {
            return [];
        }

        $tokens = [];
        foreach ((array)preg_split('/[^a-z0-9]+/i', $text) as $piece) {
            $piece = (string)$piece;
            // Very short fragments carry little information.
            if (strlen($piece) >= 3) {
                $tokens[] = $piece;
            }
        }

        return array_values(array_unique($tokens));
    }

    /**
     * Lightweight English sentence splitter: breaks after . ? ! followed by
     * whitespace; citation marks stay inside their sentence.
     *
     * @param string $text
     * @return array Non-empty sentences
     */
    private function splitEnglishSentences(string $text): array
    {
        $text = trim($text);
        if ($text === '') {
            return [];
        }

        // Turn line breaks into spaces first so they do not interrupt a sentence.
        $text = $this->collapseWhitespace($text);

        $sentences = [];
        foreach ((array)preg_split('/(?<=[\.\?\!])\s+/', $text) as $piece) {
            $piece = trim((string)$piece);
            if ($piece !== '') {
                $sentences[] = $piece;
            }
        }
        return $sentences;
    }

    /**
     * Join the article fragments into one plain-text string: tags removed,
     * entities decoded, whitespace collapsed, one fragment per line.
     *
     * @param array $articleMainContents Strings or arrays with a 'content' key
     * @return string
     */
    private function buildPlainText(array $articleMainContents): string
    {
        $chunks = [];
        foreach ($articleMainContents as $row) {
            $text = is_array($row) ? (string)($row['content'] ?? '') : (string)$row;
            if ($text === '') {
                continue;
            }
            // Drop the markup but keep the citation marks themselves.
            $text = (string)preg_replace('/<\s*\/?\s*blue[^>]*>/i', '', $text);
            $text = strip_tags($text);
            $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
            $text = $this->collapseWhitespace(trim($text));
            if ($text !== '') {
                $chunks[] = $text;
            }
        }
        return implode("\n", $chunks);
    }

    /**
     * Collapse every run of whitespace into a single space.
     *
     * @param string $text
     * @return string
     */
    private function collapseWhitespace(string $text): string
    {
        $collapsed = preg_replace('/\s+/u', ' ', $text);
        if ($collapsed === null) {
            // The /u pattern fails on invalid UTF-8; fall back to a byte-wise match.
            $collapsed = preg_replace('/\s+/', ' ', $text);
        }
        return (string)$collapsed;
    }

    /**
     * Collect the citation numbers referenced in a sentence.
     *
     * Recognises [n], lists such as [1, 2; 7] and ranges such as [3-5]
     * (hyphen, en dash or em dash). Ranges are expanded; reversed or very long
     * ranges are read as their two end points only.
     *
     * @param string $sentence
     * @return array Positive integers, de-duplicated, in order of first appearance
     */
    private function extractCitationMarks(string $sentence): array
    {
        $found = preg_match_all(
            '/\[\s*(\d+(?:\s*[-,;\x{2013}\x{2014}]\s*\d+)*)\s*\]/u',
            $sentence,
            $m
        );
        if ($found === false) {
            // Invalid UTF-8: retry with an ASCII-only pattern.
            $found = preg_match_all('/\[\s*(\d+(?:\s*[-,;]\s*\d+)*)\s*\]/', $sentence, $m);
        }
        if (!$found) {
            return [];
        }

        $maxSpan = 200; // guards against absurd ranges such as [1-999999]
        $marks = [];
        foreach ($m[1] as $group) {
            $group = str_replace(["\xE2\x80\x93", "\xE2\x80\x94"], '-', $group);
            foreach ((array)preg_split('/\s*[,;]\s*/', $group) as $item) {
                $bounds = (array)preg_split('/\s*-\s*/', trim((string)$item));
                if (count($bounds) === 2) {
                    $from = intval($bounds[0]);
                    $to = intval($bounds[1]);
                    if ($to >= $from && $to - $from <= $maxSpan) {
                        for ($n = $from; $n <= $to; $n++) {
                            $marks[] = $n;
                        }
                        continue;
                    }
                }
                foreach ($bounds as $bound) {
                    $marks[] = intval($bound);
                }
            }
        }

        $positive = array_filter($marks, function ($n) {
            return $n > 0;
        });
        return array_values(array_unique($positive));
    }
}