Files
tougao/application/common/CrossrefService.php
2026-04-03 11:45:45 +08:00

766 lines
27 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace app\common;
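/**
 * Crossref API utility class.
 *
 * Notes:
 * - Extracted into a service, modelled on the implementation style of
 *   application/api/controller/Crossrefdoi.php.
 * - Only performs "request + parse"; it contains no database reads or writes.
 */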
class CrossrefService
{
// Configuration
private $mailto = ''; // Contact e-mail (original note: raises request priority); appended to the URL and User-Agent by the fetch loop
private $timeout = 15; // Request timeout in seconds (passed to CURLOPT_TIMEOUT)
private $maxRetry = 2; // Maximum retry count per DOI; the fetch loop uses it as the attempt limit
private $crossrefUrl = "https://api.crossref.org/works/"; // Endpoint base URL; the encoded DOI is appended to it
/**
 * Create the service, optionally overriding the default settings.
 *
 * Recognised keys of $config: mailto, timeout, maxRetry, crossrefUrl.
 * Keys that are absent (or null) keep their defaults; a non-array
 * $config is ignored entirely.
 *
 * @param array $config
 */
public function __construct($config = [])
{
    if (!is_array($config)) {
        return;
    }

    // String-valued settings.
    foreach (['mailto', 'crossrefUrl'] as $key) {
        if (isset($config[$key])) {
            $this->$key = (string)$config[$key];
        }
    }

    // Integer-valued settings.
    foreach (['timeout', 'maxRetry'] as $key) {
        if (isset($config[$key])) {
            $this->$key = intval($config[$key]);
        }
    }
}
/**
 * Set the contact e-mail used for Crossref requests.
 *
 * @param mixed $mailto Value is converted to string.
 * @return $this For method chaining.
 */
public function setMailto($mailto)
{
    $this->mailto = strval($mailto);

    return $this;
}
/**
 * Convert an in-text citation number to a reference index.
 *
 * The marker [n] in the article body corresponds to
 * production_article_refer.index = n - 1 (the index is zero-based).
 * Results are clamped so they never go below 0.
 *
 * @param int $citationMark Citation number from the body, e.g. 13 for [13]
 * @return int production_article_refer.index, e.g. 12
 */
public function referIndexFromCitationMark(int $citationMark): int
{
    $zeroBased = $citationMark - 1;

    return $zeroBased > 0 ? $zeroBased : 0;
}
/**
 * Convert a zero-based reference index back to the in-text citation number.
 *
 * production_article_refer.index = i corresponds to the marker [i + 1].
 * Results are clamped so they never go below 0.
 *
 * @param int $referIndex production_article_refer.index, e.g. 12
 * @return int Citation number n, e.g. 13
 */
public function citationMarkFromReferIndex(int $referIndex): int
{
    $oneBased = $referIndex + 1;

    return $oneBased > 0 ? $oneBased : 0;
}
/**
 * Batch citation quality check (no database access).
 *
 * - Takes the article's section contents (the `content` values of t_article_main).
 * - Takes the reference entries (rows of production_article_refer).
 * - For every [n] marker found in the text, extracts the surrounding English
 *   sentences as context, maps the marker to refer.index = n - 1 and calls
 *   qcCitation() on the pair.
 *
 * This method performs no database queries, so callers are free to assemble
 * the inputs from any source.
 *
 * @param array $articleMainContents Article fragments in display order; each element is a
 *                                   string or an array carrying a `content` key
 * @param array $referRows           production_article_refer rows (rows without `index` are ignored;
 *                                   title/author/joura/dateno/refer_doi/doilink are read when present)
 * @param array $options             Passed through to qcCitation(); additionally supports
 *                                   - sentence_window (int): number of neighbouring sentences kept on
 *                                     each side of the citing sentence, default 1
 * @return array List of results, each with citation_mark / refer_index / context / ref_meta / qc
 */
public function qcArticleCitations(array $articleMainContents, array $referRows, array $options = []): array
{
    $window = 1;
    if (isset($options['sentence_window'])) {
        $window = max(0, intval($options['sentence_window']));
    }

    // 1) Flatten every fragment to plain text while keeping the [n] markers.
    $chunks = [];
    foreach ($articleMainContents as $fragment) {
        $text = is_array($fragment) ? (string)($fragment['content'] ?? '') : (string)$fragment;
        if ($text === '') {
            continue;
        }
        // Drop the custom <blue> tag first, then every remaining tag.
        $text = preg_replace('/<\s*\/?\s*blue[^>]*>/i', '', $text);
        $text = html_entity_decode(strip_tags($text), ENT_QUOTES | ENT_HTML5, 'UTF-8');
        $text = preg_replace('/\s+/u', ' ', trim($text));
        if ($text !== '') {
            $chunks[] = $text;
        }
    }

    $fullText = implode("\n", $chunks);
    if ($fullText === '') {
        return [];
    }

    // 2) Index the reference rows by refer index (later rows win on duplicates).
    $referMap = [];
    foreach ($referRows as $referRow) {
        if (is_array($referRow) && isset($referRow['index'])) {
            $referMap[intval($referRow['index'])] = $referRow;
        }
    }

    // 3) Split the text into sentences.
    $sentences = $this->splitEnglishSentences($fullText);
    if (empty($sentences)) {
        return [];
    }
    $lastPosition = count($sentences) - 1;

    // 4) Walk the sentences and evaluate every distinct [n] found in each one.
    $results = [];
    foreach ($sentences as $position => $sentence) {
        if (!preg_match_all('/\[(\d+)\]/', $sentence, $found)) {
            continue;
        }

        // The context only depends on the sentence position, not on the marker.
        $from = max(0, $position - $window);
        $to = min($lastPosition, $position + $window);
        $context = implode(' ', array_slice($sentences, $from, $to - $from + 1));
        $context = trim(preg_replace('/\s+/u', ' ', $context));

        foreach (array_unique(array_map('intval', $found[1])) as $citationMark) {
            if ($citationMark <= 0) {
                continue;
            }
            $referIndex = $this->referIndexFromCitationMark($citationMark);
            if (!isset($referMap[$referIndex])) {
                continue;
            }

            $refMeta = $referMap[$referIndex];
            $qc = $this->qcCitation($context, $refMeta, $options);

            $results[] = [
                'citation_mark' => $citationMark, // number n taken from [n] in the body
                'refer_index' => $referIndex, // production_article_refer.index
                'context' => $context,
                'ref_meta' => [
                    'p_refer_id' => $refMeta['p_refer_id'] ?? 0,
                    'title' => $refMeta['title'] ?? '',
                    'author' => $refMeta['author'] ?? '',
                    'joura' => $refMeta['joura'] ?? '',
                    'dateno' => $refMeta['dateno'] ?? '',
                    'refer_doi' => $refMeta['refer_doi'] ?? '',
                    'doilink' => $refMeta['doilink'] ?? '',
                    'index' => $refMeta['index'] ?? $referIndex,
                ],
                'qc' => $qc,
            ];
        }
    }

    return $results;
}
/**
 * Filter out malformed DOIs, keeping only values shaped like 10.xxxx/xxx.
 *
 * The input is cast to string and trimmed; the trimmed value is returned
 * when it starts with "10.", at least four digits, a slash and at least one
 * further character. Anything else yields an empty string.
 *
 * @param string $doi
 * @return string The trimmed DOI, or '' when it does not match
 */
public function filterValidDoi($doi = '')
{
    $candidate = trim((string)$doi);
    $looksValid = $candidate !== '' && preg_match('/^10\.\d{4,}\/.+/', $candidate);

    return $looksValid ? $candidate : '';
}
/**
 * Fetch the Crossref `message` payload for a DOI (with retries).
 *
 * The DOI is validated with filterValidDoi() first; an invalid DOI returns
 * null without issuing any request.
 *
 * @param string $doi
 * @return array|null
 */
public function fetchWork($doi)
{
    $validDoi = $this->filterValidDoi($doi);

    return $validDoi === '' ? null : $this->fetchSingleDoiWithRetry($validDoi);
}
/**
 * Fetch a work and return the commonly used fields
 * (title / journal / authors / volume-issue-pages / retraction status / URL).
 *
 * @param string $doi
 * @return array|null Null when the work could not be fetched; otherwise an array with the keys
 *                    doi, title, joura, publisher, authors, author_str, dateno,
 *                    is_retracted (0|1), retract_reason, doilink and raw (the full message)
 */
public function fetchWorkSummary($doi)
{
    $message = $this->fetchWork($doi);
    if (!$message) {
        return null;
    }

    $validDoi = $this->filterValidDoi($doi);
    $publisher = $this->getPublisher($message);
    $authors = $this->getAuthors($message);
    $retraction = $this->checkRetracted($message);

    // Prefer the full journal title; fall back to the abbreviated one.
    $journal = $publisher['short_title'] ?? '';
    if (!empty($publisher['title'])) {
        $journal = $publisher['title'];
    }

    // Use the URL reported by Crossref, or build the canonical doi.org link.
    $link = $this->getDolink($message);
    if (empty($link)) {
        $link = 'https://doi.org/' . $validDoi;
    }

    return [
        'doi' => $validDoi,
        'title' => $this->getTitle($message),
        'joura' => $journal,
        'publisher' => $publisher,
        'authors' => $authors,
        'author_str' => empty($authors) ? '' : implode(',', $authors),
        'dateno' => $this->getVolumeIssuePages($message),
        'is_retracted' => empty($retraction['is_retracted']) ? 0 : 1,
        'retract_reason' => $retraction['reason'] ?? '',
        'doilink' => $link,
        'raw' => $message,
    ];
}
/**
 * Query Crossref for a single DOI, retrying on transient failures.
 *
 * Behaviour:
 * - At least one request is always made, even when maxRetry is configured as 0 or less.
 * - HTTP 200 ends the loop: the decoded `message` is returned when the body is valid JSON
 *   with status "ok", otherwise null.
 * - Client errors other than 408/429 (e.g. 404 for an unknown DOI) are permanent, so null
 *   is returned immediately instead of retrying.
 * - 429 waits 5 seconds, every other failure waits 1 second; no wait happens after the
 *   final attempt.
 * - TLS certificates are verified. If this fails on a host without a CA bundle, fix the
 *   PHP/cURL CA configuration rather than disabling verification.
 *
 * @param string $doi An already validated DOI
 * @return array|null The Crossref `message` array, or null on failure
 */
private function fetchSingleDoiWithRetry($doi)
{
    // Strip CR/LF so a configured mailto can never inject extra HTTP headers.
    $mailto = str_replace(["\r", "\n"], '', (string)$this->mailto);

    $url = $this->crossrefUrl . rawurlencode($doi);
    $userAgent = 'DOI-Fetcher/1.0';
    if ($mailto !== '') {
        $url .= '?mailto=' . rawurlencode($mailto);
        $userAgent .= " (mailto:{$mailto})";
    }

    $maxAttempts = max(1, (int)$this->maxRetry);
    for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
        $ch = curl_init();
        if ($ch === false) {
            return null;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
        curl_setopt($ch, CURLOPT_HTTPHEADER, [
            "User-Agent: {$userAgent}",
        ]);
        $response = curl_exec($ch);
        $httpCode = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        if ($httpCode === 200) {
            $data = is_string($response) ? json_decode($response, true) : null;
            $ok = is_array($data) && isset($data['status']) && $data['status'] == 'ok';

            return ($ok && isset($data['message']) && is_array($data['message'])) ? $data['message'] : null;
        }

        // Permanent client errors will not succeed on a retry.
        if ($httpCode >= 400 && $httpCode < 500 && $httpCode !== 429 && $httpCode !== 408) {
            return null;
        }

        // Back off only when another attempt follows (longer when rate-limited).
        if ($attempt < $maxAttempts) {
            sleep($httpCode === 429 ? 5 : 1);
        }
    }

    return null;
}
/**
 * Extract the title: the first element of the `title` field, or '' when absent.
 *
 * @param array $aDoiInfo Crossref message
 * @return string
 */
public function getTitle($aDoiInfo = [])
{
    if (isset($aDoiInfo['title'][0])) {
        return $aDoiInfo['title'][0];
    }

    return '';
}
/**
 * Extract journal / publisher information.
 *
 * @param array $aDoiInfo Crossref message
 * @return array Keys: title (first container-title), short_title (first
 *               short-container-title), ISSN (as given, default []) and
 *               publisher (default '')
 */
public function getPublisher($aDoiInfo = [])
{
    $journalTitle = $aDoiInfo['container-title'][0] ?? '';
    $shortTitle = $aDoiInfo['short-container-title'][0] ?? '';
    $issn = isset($aDoiInfo['ISSN']) ? $aDoiInfo['ISSN'] : [];
    $publisherName = isset($aDoiInfo['publisher']) ? $aDoiInfo['publisher'] : '';

    return [
        'title' => $journalTitle,
        'short_title' => $shortTitle,
        'ISSN' => $issn,
        'publisher' => $publisherName,
    ];
}
/**
 * Extract the author list as display names.
 *
 * Each entry becomes "given family" (whichever parts exist, without stray
 * whitespace). Entries that carry neither part but do have a `name` field
 * (e.g. organisations or consortia) use that name instead of being dropped.
 * Entries with no usable name are skipped.
 *
 * @param array $aDoiInfo Crossref message
 * @return array List of author names
 */
public function getAuthors($aDoiInfo = [])
{
    $entries = $aDoiInfo['author'] ?? [];
    if (!is_array($entries)) {
        return [];
    }

    $authors = [];
    foreach ($entries as $author) {
        if (!is_array($author)) {
            continue;
        }
        $given = trim((string)($author['given'] ?? ''));
        $family = trim((string)($author['family'] ?? ''));

        // trim() removes the separator when only one of the two parts exists.
        $name = trim($given . ' ' . $family);
        if ($name === '') {
            $name = trim((string)($author['name'] ?? ''));
        }
        if ($name !== '') {
            $authors[] = $name;
        }
    }

    return $authors;
}
/**
 * Extract the publication year from `issued.date-parts[0][0]`.
 *
 * @param array $aDoiInfo Crossref message
 * @return string The year as a string, or '' when missing/empty
 */
public function getPublishYear($aDoiInfo = [])
{
    $year = $aDoiInfo['issued']['date-parts'][0][0] ?? null;

    return empty($year) ? '' : (string)$year;
}
/**
 * Build the "year:volume(issue):start-end" string, e.g. 2024:10(2):100-120.
 *
 * Parts that are missing are simply left out. Pages come from
 * page.start/page.end, then first-page/last-page, and finally from the raw
 * `page` value when no start page was found.
 *
 * @param array $aDoiInfo Crossref message
 * @return string
 */
public function getVolumeIssuePages($aDoiInfo = [])
{
    $segments = [];

    $year = $this->getPublishYear($aDoiInfo);
    if ($year) {
        $segments[] = $year;
    }

    $volume = $aDoiInfo['volume'] ?? '';
    if ($volume) {
        $issue = $aDoiInfo['issue'] ?? '';
        $segments[] = $issue ? "{$volume}({$issue})" : "{$volume}";
    }

    $firstPage = $aDoiInfo['page']['start'] ?? $aDoiInfo['first-page'] ?? '';
    $lastPage = $aDoiInfo['page']['end'] ?? $aDoiInfo['last-page'] ?? '';
    if ($firstPage) {
        $pages = $lastPage ? "{$firstPage}-{$lastPage}" : "{$firstPage}";
    } else {
        $pages = $aDoiInfo['page'] ?? '';
    }
    if ($pages) {
        $segments[] = $pages;
    }

    return implode(':', $segments);
}
/**
 * Detect retracted articles (original note: same logic as Crossrefdoi.php).
 *
 * Runs a series of independent checks on the Crossref message. Any hit sets
 * is_retracted to true; the checks run in a fixed order and each hit
 * overwrites `reason`, so the reported reason is that of the LAST check that
 * matched. Note that "correction" types and "corrects" relations are flagged
 * the same way as retractions.
 *
 * @param array $aDoiInfo Crossref message
 * @return array ['is_retracted' => bool, 'reason' => string]
 */
public function checkRetracted($aDoiInfo = [])
{
// Defaults: not retracted, with the "not retracted" reason text.
$isRetracted = false;
$reason = "未撤稿";
$sType = strtolower($aDoiInfo['type'] ?? '');
$sSubtype = strtolower($aDoiInfo['subtype'] ?? '');
// Check 1: the work type itself is a retraction/correction notice.
if ($sType && in_array($sType, ['retraction', 'correction'])) {
$isRetracted = true;
$reason = "文章类型为{$sType}(撤稿/更正声明)";
}
// Check 2: same test on the subtype.
if ($sSubtype && in_array($sSubtype, ['retraction', 'correction'])) {
$isRetracted = true;
$reason = "文章类型为{$sSubtype}(撤稿/更正声明)";
}
// Check 3: an `update-type` array that contains 'retraction'.
if (isset($aDoiInfo['update-type']) && is_array($aDoiInfo['update-type']) && in_array('retraction', $aDoiInfo['update-type'])) {
$isRetracted = true;
$reason = "官方标记为撤稿update-type: retraction";
}
// Check 4: a relation of type is-retraction-of / corrects; only the first such relation is reported.
if (isset($aDoiInfo['relation']) && !empty($aDoiInfo['relation'])) {
foreach ($aDoiInfo['relation'] as $relType => $relItems) {
if (in_array($relType, ['is-retraction-of', 'corrects'])) {
$isRetracted = true;
// Fallback text is used when the first related item has no id.
$relatedDoi = $relItems[0]['id'] ?? '未知';
$reason = "关联撤稿文章{$relatedDoi}(关系:{$relType}";
break;
}
}
}
// Check 5: any `update-to` entry whose type or label contains "retract".
if (isset($aDoiInfo['update-to']) && is_array($aDoiInfo['update-to'])) {
foreach ($aDoiInfo['update-to'] as $update) {
$updateType = strtolower($update['type'] ?? '');
$updateLabel = strtolower($update['label'] ?? '');
if (strpos($updateType, 'retract') !== false || strpos($updateLabel, 'retract') !== false) {
$isRetracted = true;
$reason = "update-to 标记撤稿({$updateType}/{$updateLabel}";
break;
}
}
}
// Check 6: keyword match on any title (substring match, case-insensitive via strtolower).
// NOTE(review): plain substring matching may also hit legitimate titles that merely
// contain the word "retraction" or "withdrawal" — confirm this is acceptable.
$aTitles = $aDoiInfo['title'] ?? [];
foreach ($aTitles as $value) {
$sTitleLower = strtolower($value);
if (strpos($sTitleLower, 'retraction') !== false || strpos($sTitleLower, 'retracted') !== false
|| strpos($sTitleLower, 'withdrawal') !== false || strpos($sTitleLower, 'withdrawn') !== false) {
$isRetracted = true;
$reason = "标题包含撤稿关键词";
break;
}
}
return [
'is_retracted' => $isRetracted,
'reason' => $reason,
];
}
/**
 * Return the DOI link: the `URL` field of the Crossref message, or '' when absent.
 *
 * @param array $aDoiInfo Crossref message
 * @return string
 */
public function getDolink($aDoiInfo = [])
{
    return isset($aDoiInfo['URL']) ? $aDoiInfo['URL'] : '';
}
/**
 * Format a Crossref date object (`date-parts`) as Y, Y-MM or Y-MM-DD.
 *
 * Only the first date-parts entry is used. Month and day are left-padded
 * with zeros to two digits and appended only when present.
 *
 * @param array $dateObj e.g. ['date-parts' => [[2024, 3, 7]]]
 * @return string '' when no date parts are available
 */
public function parseDateParts($dateObj)
{
    $parts = $dateObj['date-parts'][0] ?? [];
    if (empty($parts)) {
        return '';
    }

    $formatted = (string)($parts[0] ?? '');
    // Positions 1 and 2 hold month and day respectively.
    foreach ([1, 2] as $position) {
        $value = $parts[$position] ?? '';
        if ($value !== '') {
            $formatted .= '-' . str_pad((string)$value, 2, '0', STR_PAD_LEFT);
        }
    }

    return $formatted;
}
/**
 * Citation quality check for a single (context, reference) pair.
 *
 * Answers two questions:
 *  (1) Does the cited work look retracted/corrected (based on Crossref)?
 *  (2) Is the citing context related to the cited work (based on evidence hits)?
 *
 * Notes:
 * - Designed for the case where no abstract/keywords are available: only
 *   title / author / journal / year are compared with the citing sentences.
 * - When neither refer_doi nor doilink yields a DOI, problem_flag stays 'unknown'.
 *
 * @param string $contextText Sentences around the citation (English; ideally the citing
 *                            sentence plus a few neighbours)
 * @param array  $refMeta     Metadata of the cited entry (e.g. a production_article_refer row):
 *                            refer_doi / doilink / title / author / joura / dateno
 * @param array  $options     Optional settings
 *                            - check_retraction (bool): query Crossref for retraction/correction; default true
 *                            - background_phrases (array): phrases marking generic background
 *                              citations; defaults to the built-in list
 *
 * @return array
 *   [
 *     'problem_flag'    => 'ok'|'retracted_or_corrected'|'unknown',
 *     'problem_reason'  => string,
 *     'relevance_flag'  => 'related'|'unsure'|'unsure_background'|'suspicious_unrelated',
 *     'relevance_score' => float,
 *     'reason'          => string
 *   ]
 */
public function qcCitation(string $contextText, array $refMeta, array $options = []): array
{
    $contextText = trim($contextText);

    $metaString = function ($key) use ($refMeta) {
        return (string)($refMeta[$key] ?? '');
    };

    // 1) Retraction / correction check (hard rule, drives problem_flag).
    $checkRetraction = !isset($options['check_retraction']) || (bool)$options['check_retraction'];
    $doi = $this->extractDoiFromMeta($metaString('refer_doi'), $metaString('doilink'));

    $problemFlag = 'unknown';
    if (!$checkRetraction) {
        $problemReason = 'Skip retraction check';
    } elseif (empty($doi)) {
        $problemReason = 'DOI is empty';
    } else {
        $summary = $this->fetchWorkSummary($doi);
        if (!$summary || !isset($summary['is_retracted'])) {
            $problemReason = 'Crossref fetch failed or returned unexpected data';
        } elseif ((int)$summary['is_retracted'] === 1) {
            $problemFlag = 'retracted_or_corrected';
            $problemReason = empty($summary['retract_reason'])
                ? 'Crossref indicates retraction/correction'
                : $summary['retract_reason'];
        } else {
            $problemFlag = 'ok';
            $problemReason = 'Crossref indicates not retracted/corrected';
        }
    }

    // 2) Relevance check (soft rules + evidence hits).
    if (isset($options['background_phrases'])) {
        $backgroundPhrases = (array)$options['background_phrases'];
    } else {
        $backgroundPhrases = [
            'several studies',
            'many studies',
            'the literature',
            'the existing literature',
            'has been reported',
            'have been reported',
            'it has been shown',
            'previous studies',
            'the study suggests',
            'the literature suggests',
            'in the literature',
        ];
    }

    $ctxLower = strtolower($contextText);
    $isBackground = false;
    foreach ($backgroundPhrases as $phrase) {
        $needle = strtolower(trim((string)$phrase));
        if ($needle === '' || $needle === '0') {
            continue;
        }
        if (strpos($ctxLower, $needle) !== false) {
            $isBackground = true;
            break;
        }
    }

    $evidence = $this->buildEvidenceTokens([
        'title' => $metaString('title'),
        'author' => $metaString('author'),
        'journal' => $metaString('joura'),
        'year' => $metaString('dateno'),
    ]);
    $ctxTokens = $this->tokenize($contextText);

    $titleTokens = $evidence['titleTokens'] ?? [];
    $authorTokens = $evidence['authorTokens'] ?? [];
    $journalTokens = $evidence['journalTokens'] ?? [];
    $yearToken = $evidence['yearToken'] ?? '';

    // Share of title keywords that also occur in the context.
    $titleOverlap = 0.0;
    if (!empty($titleTokens)) {
        $titleOverlap = count(array_intersect($titleTokens, $ctxTokens)) / max(1, count($titleTokens));
    }

    // One matching author token is enough.
    $authorHit = 0.0;
    if (!empty($authorTokens)) {
        foreach ($authorTokens as $authorToken) {
            if ($authorToken !== '' && in_array($authorToken, $ctxTokens, true)) {
                $authorHit = 1.0;
                break;
            }
        }
    }

    // Share of journal keywords that also occur in the context.
    $journalOverlap = 0.0;
    if (!empty($journalTokens)) {
        $journalOverlap = count(array_intersect($journalTokens, $ctxTokens)) / max(1, count($journalTokens));
    }

    $yearHit = (!empty($yearToken) && strpos($ctxLower, (string)$yearToken) !== false) ? 1.0 : 0.0;

    // Weighted score, kept explainable: the higher, the more related.
    $score = round((
        0.60 * $titleOverlap +
        0.20 * $authorHit +
        0.15 * $yearHit +
        0.05 * $journalOverlap
    ), 4);

    $reasonParts = [];
    if ($score >= 0.35 && ($authorHit > 0.0 || $yearHit > 0.0)) {
        $relevanceFlag = 'related';
        $reasonParts[] = 'title_keyword_overlap_high=' . $titleOverlap;
    } elseif ($score >= 0.25) {
        $relevanceFlag = 'unsure';
        $reasonParts[] = 'evidence_score_mid=' . $score;
    } elseif ($isBackground) {
        $relevanceFlag = 'unsure_background';
        $reasonParts[] = 'background_phrases_detected';
    } else {
        $relevanceFlag = 'suspicious_unrelated';
        $reasonParts[] = 'evidence_score_low=' . $score;
    }

    $reasonParts[] = 'titleOverlap=' . $titleOverlap;
    $reasonParts[] = 'authorHit=' . $authorHit;
    $reasonParts[] = 'yearHit=' . $yearHit;
    $reasonParts[] = 'journalOverlap=' . $journalOverlap;

    return [
        'problem_flag' => $problemFlag,
        'problem_reason' => $problemReason,
        'relevance_flag' => $relevanceFlag,
        'relevance_score' => (float)$score,
        'reason' => implode('; ', $reasonParts),
    ];
}
/**
 * Extract a DOI string from refer_doi / doilink.
 *
 * refer_doi is tried first; when it is empty or does not contain a valid
 * DOI, doilink is tried as a fallback. Each candidate may be
 * - a bare DOI ("10.xxxx/yyy"),
 * - a DOI with a "doi:" prefix, or
 * - a doi.org URL ("https://doi.org/10.xxxx/yyy", also dx.doi.org); query
 *   string and fragment are dropped and percent-encoding is decoded.
 *
 * @param string $referDoi
 * @param string $doilink
 * @return string A DOI accepted by filterValidDoi(), or '' when none is found
 */
private function extractDoiFromMeta(string $referDoi, string $doilink): string
{
    foreach ([$referDoi, $doilink] as $raw) {
        $candidate = trim($raw);
        if ($candidate === '') {
            continue;
        }

        // "~" is used as delimiter because the character class contains "#";
        // with "#" as delimiter PCRE would terminate the pattern early and fail.
        if (preg_match('~doi\.org/([^?#]+)~i', $candidate, $m)) {
            $candidate = rawurldecode(trim((string)$m[1]));
        } else {
            $candidate = (string)preg_replace('/^doi:\s*/i', '', $candidate);
        }

        $doi = $this->filterValidDoi($candidate);
        if ($doi !== '') {
            return $doi;
        }
    }

    return '';
}
/**
 * Build the evidence tokens used for hit detection / rough similarity.
 *
 * - Title and journal: tokenize(), then keep unique tokens of at least four
 *   characters that are not stop words.
 * - Author: extractAuthorTokens(), minus stop words and duplicates.
 * - Year: the first 19xx/20xx number found in the `year` source string.
 *
 * @param array $src Keys: title, author, journal, year (all optional)
 * @return array Keys: titleTokens, authorTokens, journalTokens (lists) and yearToken (string)
 */
private function buildEvidenceTokens(array $src): array
{
    $stop = [
        'the','a','an','and','or','of','in','on','for','with','to','from','by','at','as','is','are',
        'was','were','be','been','being','that','this','these','those','which','who','whom','it','its',
        'we','our','us','they','their','them','i','you','your','he','she','his','her',
        'study','studies','report','reported','reports','model','models','analysis','analyses','method','methods',
        'results','result','using','used','show','shown','demonstrated','demonstrate',
    ];

    // Shared keyword rule for title and journal tokens.
    $isKeyword = function ($token) use ($stop) {
        return mb_strlen($token) >= 4 && !in_array($token, $stop, true);
    };
    $keywordsOf = function ($text) use ($isKeyword) {
        return array_values(array_filter(array_unique($this->tokenize($text)), $isKeyword));
    };

    $authorTokens = [];
    foreach ($this->extractAuthorTokens((string)($src['author'] ?? '')) as $token) {
        $token = trim($token);
        if ($token === '' || $token === '0' || in_array($token, $stop, true)) {
            continue;
        }
        $authorTokens[] = $token;
    }

    $yearToken = '';
    if (preg_match('/(19\d{2}|20\d{2})/', (string)($src['year'] ?? ''), $yearMatch)) {
        $yearToken = (string)$yearMatch[1];
    }

    return [
        'titleTokens' => $keywordsOf((string)($src['title'] ?? '')),
        'authorTokens' => array_values(array_unique($authorTokens)),
        'journalTokens' => $keywordsOf((string)($src['journal'] ?? '')),
        'yearToken' => $yearToken,
    ];
}
/**
 * Extract surname-like tokens from an author string (simplified heuristic).
 *
 * The string is split on ",", ";" and " and ". Within each piece the words
 * are scanned from last to first and the first word containing at least
 * four letters is taken, so both "John Smith" and "Smith J" / "Smith J.A."
 * yield "smith". Pieces without such a word (bare initials, "et al.") give
 * no token.
 *
 * Tokens are lowercased, limited to letters, "." and "-", and stripped of
 * leading/trailing "." and "-".
 *
 * @param string $authorStr
 * @return array Unique lowercase tokens
 */
private function extractAuthorTokens(string $authorStr): array
{
    $authorStr = trim($authorStr);
    if ($authorStr === '') {
        return [];
    }

    // Break the list apart on the common separators.
    $parts = preg_split('/[,;]| and /i', $authorStr);
    if ($parts === false) {
        return [];
    }

    $tokens = [];
    foreach ($parts as $part) {
        $words = preg_split('/\s+/', trim((string)$part), -1, PREG_SPLIT_NO_EMPTY);
        if (empty($words)) {
            continue;
        }

        // Walk backwards so trailing initials are skipped in favour of the surname.
        foreach (array_reverse($words) as $word) {
            $candidate = strtolower((string)preg_replace('/[^A-Za-z\.\-]/', '', $word));
            $candidate = trim($candidate, '.-');
            $letterCount = strlen((string)preg_replace('/[^a-z]/', '', $candidate));
            if ($letterCount >= 4) {
                $tokens[] = $candidate;
                break;
            }
        }
    }

    return array_values(array_unique($tokens));
}
/**
 * Lightweight tokenizer for English text.
 *
 * Lowercases the text, splits it on every run of characters other than
 * a-z / 0-9 and keeps the unique pieces that are at least three characters
 * long, in order of first appearance.
 *
 * @param string $text
 * @return array
 */
private function tokenize(string $text): array
{
    $normalized = strtolower(trim($text));
    if ($normalized === '') {
        return [];
    }

    $pieces = preg_split('/[^a-z0-9]+/i', $normalized);

    // Pieces consist solely of letters/digits, so only the length check is
    // needed; it also removes the empty pieces produced at the string edges.
    $informative = array_filter($pieces, function ($piece) {
        return mb_strlen((string)$piece) >= 3;
    });

    return array_values(array_unique($informative));
}
/**
 * Lightweight English sentence splitter.
 *
 * Whitespace (including line breaks) is collapsed to single spaces, then
 * the text is cut after every ".", "?" or "!" that is followed by
 * whitespace. [n] markers stay inside their sentence; empty pieces are
 * dropped.
 *
 * @param string $text
 * @return array List of trimmed sentences
 */
private function splitEnglishSentences(string $text): array
{
    $text = trim($text);
    if ($text === '') {
        return [];
    }

    // Collapse line breaks first so they cannot interfere with the split.
    $flattened = preg_replace('/\s+/u', ' ', $text);

    $sentences = [];
    $pieces = preg_split('/(?<=[\.\?\!])\s+/', $flattened);
    foreach ($pieces as $piece) {
        $sentence = trim((string)$piece);
        if ($sentence !== '') {
            $sentences[] = $sentence;
        }
    }

    return $sentences;
}
}