自动推广

This commit is contained in:
wangjinlei
2026-04-03 11:45:45 +08:00
parent 22947a56a4
commit a802b2e923
11 changed files with 2240 additions and 36 deletions

View File

@@ -0,0 +1,765 @@
<?php
namespace app\common;
/**
* Crossref API 工具类
*
* 说明:
* - 仿照 application/api/controller/Crossrefdoi.php 的实现风格抽成 Service
* - 仅做「请求 + 解析」;不包含任何数据库读写
*/
class CrossrefService
{
// Configuration (each value can be overridden through the constructor's $config array).
private $mailto = ''; // Contact e-mail; when non-empty it is sent as the `mailto` query parameter and in the User-Agent (original note: raises request priority)
private $timeout = 15; // Request timeout in seconds (passed to CURLOPT_TIMEOUT)
private $maxRetry = 2; // Upper bound of the request loop for a single DOI
private $crossrefUrl = "https://api.crossref.org/works/"; // Endpoint prefix; the URL-encoded DOI is appended to it
/**
 * Create the service, optionally overriding the default settings.
 *
 * Recognised keys of $config: 'mailto' and 'crossrefUrl' (stored as strings),
 * 'timeout' and 'maxRetry' (stored via intval()). Unknown keys are ignored and
 * a non-array $config leaves every default untouched.
 *
 * @param array $config
 */
public function __construct($config = [])
{
    if (!is_array($config)) {
        return;
    }

    foreach (['mailto', 'crossrefUrl'] as $textKey) {
        if (isset($config[$textKey])) {
            $this->{$textKey} = (string)$config[$textKey];
        }
    }

    foreach (['timeout', 'maxRetry'] as $numberKey) {
        if (isset($config[$numberKey])) {
            $this->{$numberKey} = intval($config[$numberKey]);
        }
    }
}
/**
 * Replace the contact e-mail used for subsequent requests.
 *
 * @param mixed $mailto Value is converted to string.
 * @return $this The same instance, so calls can be chained.
 */
public function setMailto($mailto)
{
    $this->mailto = strval($mailto);

    return $this;
}
/**
 * Convert an in-text citation number to the zero-based reference index:
 * the marker [n] in the body corresponds to production_article_refer.index = n - 1.
 *
 * @param int $citationMark Citation number taken from the text, e.g. 13 for "[13]".
 * @return int Zero-based index (e.g. 12); never negative.
 */
public function referIndexFromCitationMark(int $citationMark): int
{
    $zeroBased = $citationMark - 1;

    return $zeroBased > 0 ? $zeroBased : 0;
}
/**
 * Convert a zero-based reference index (production_article_refer.index) back to
 * the citation number [n] used in the article body.
 *
 * Citation numbers start at 1, so an out-of-range (negative) index is clamped to
 * the lowest valid mark, 1. This mirrors referIndexFromCitationMark(), which
 * clamps to the lowest valid index, 0.
 *
 * @param int $referIndex Zero-based index, e.g. 12.
 * @return int Citation number n (e.g. 13); always >= 1.
 */
public function citationMarkFromReferIndex(int $referIndex): int
{
    return max(1, $referIndex + 1);
}
/**
 * Batch citation QC for a whole article. Performs no database access, so the
 * caller decides where the article text and the reference rows come from.
 *
 * Steps:
 *  1. Flatten the article sections to plain text, keeping the [n] markers.
 *  2. Index the reference rows by their `index` field.
 *  3. Split the text into sentences and, for every [n] whose reference
 *     (index n - 1) exists, run qcCitation() on the surrounding sentences.
 *
 * The retraction verdict depends only on the reference, not on the citing
 * sentence, so it is computed once per reference and reused for every further
 * occurrence. Previously each occurrence triggered its own Crossref lookup.
 *
 * @param array $articleMainContents Section contents in reading order; each element is a string
 *                                   or an array with a 'content' key.
 * @param array $referRows           Reference rows; rows that are not arrays or lack 'index' are ignored.
 *                                   Fields read: index, p_refer_id, title, author, joura, dateno,
 *                                   refer_doi, doilink.
 * @param array $options             Passed through to qcCitation(); additionally supports
 *                                   - sentence_window (int): number of neighbouring sentences taken on
 *                                     each side of the citing sentence; default 1, negatives become 0.
 * @return array List of entries with keys citation_mark, refer_index, context, ref_meta, qc.
 */
public function qcArticleCitations(array $articleMainContents, array $referRows, array $options = []): array
{
    $window = isset($options['sentence_window']) ? max(0, intval($options['sentence_window'])) : 1;

    // 1) Assemble the plain text of the article (the [n] markers survive tag stripping).
    $chunks = [];
    foreach ($articleMainContents as $row) {
        $text = is_array($row) ? (string)($row['content'] ?? '') : (string)$row;
        if ($text === '') {
            continue;
        }
        // Drop the custom <blue> wrapper first, then any remaining markup.
        $text = (string)preg_replace('/<\s*\/?\s*blue[^>]*>/i', '', $text);
        $text = strip_tags($text);
        $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
        // preg_replace() returns null on invalid UTF-8 with the /u flag; treat that as empty.
        $text = (string)preg_replace('/\s+/u', ' ', trim($text));
        if ($text !== '') {
            $chunks[] = $text;
        }
    }
    $fullText = implode("\n", $chunks);
    if ($fullText === '') {
        return [];
    }

    // 2) Reference lookup table: refer index => row (a later row with the same index wins).
    $referMap = [];
    foreach ($referRows as $r) {
        if (!is_array($r) || !isset($r['index'])) {
            continue;
        }
        $referMap[intval($r['index'])] = $r;
    }

    // 3) Sentence segmentation.
    $sentences = $this->splitEnglishSentences($fullText);
    if (empty($sentences)) {
        return [];
    }
    $lastSentence = count($sentences) - 1;

    // Options used for repeat occurrences of an already-checked reference: same
    // relevance settings, but without another Crossref round-trip.
    $offlineOptions = $options;
    $offlineOptions['check_retraction'] = false;
    $problemByRefer = []; // refer index => ['problem_flag' => ..., 'problem_reason' => ...]

    // 4) Walk the sentences and evaluate every [n] found in them.
    $results = [];
    foreach ($sentences as $si => $sent) {
        if (!preg_match_all('/\[(\d+)\]/', $sent, $m)) {
            continue;
        }

        $ctx = null; // the context is identical for all marks of one sentence; build it lazily
        foreach (array_unique(array_map('intval', $m[1])) as $citationMark) {
            if ($citationMark <= 0) {
                continue;
            }
            $referIndex = $this->referIndexFromCitationMark($citationMark);
            if (!isset($referMap[$referIndex])) {
                continue;
            }

            if ($ctx === null) {
                $start = max(0, $si - $window);
                $end = min($lastSentence, $si + $window);
                $ctx = implode(' ', array_slice($sentences, $start, $end - $start + 1));
                $ctx = trim((string)preg_replace('/\s+/u', ' ', $ctx));
            }

            $refMeta = $referMap[$referIndex];
            if (isset($problemByRefer[$referIndex])) {
                $qc = $this->qcCitation($ctx, $refMeta, $offlineOptions);
                $qc['problem_flag'] = $problemByRefer[$referIndex]['problem_flag'];
                $qc['problem_reason'] = $problemByRefer[$referIndex]['problem_reason'];
            } else {
                $qc = $this->qcCitation($ctx, $refMeta, $options);
                $problemByRefer[$referIndex] = [
                    'problem_flag' => $qc['problem_flag'] ?? 'unknown',
                    'problem_reason' => $qc['problem_reason'] ?? '',
                ];
            }

            $results[] = [
                'citation_mark' => $citationMark, // number n taken from [n] in the text
                'refer_index' => $referIndex,     // production_article_refer.index
                'context' => $ctx,
                'ref_meta' => [
                    'p_refer_id' => $refMeta['p_refer_id'] ?? 0,
                    'title' => $refMeta['title'] ?? '',
                    'author' => $refMeta['author'] ?? '',
                    'joura' => $refMeta['joura'] ?? '',
                    'dateno' => $refMeta['dateno'] ?? '',
                    'refer_doi' => $refMeta['refer_doi'] ?? '',
                    'doilink' => $refMeta['doilink'] ?? '',
                    'index' => $refMeta['index'] ?? $referIndex,
                ],
                'qc' => $qc,
            ];
        }
    }

    return $results;
}
/**
 * Accept only strings that look like a DOI ("10." + at least four digits + "/" + suffix).
 *
 * @param string $doi Raw value; surrounding whitespace is trimmed.
 * @return string The trimmed DOI, or '' when the value does not match.
 */
public function filterValidDoi($doi = '')
{
    $candidate = trim((string)$doi);
    $looksLikeDoi = $candidate !== '' && preg_match('/^10\.\d{4,}\/.+/', $candidate);

    return $looksLikeDoi ? $candidate : '';
}
/**
 * Fetch the Crossref `message` payload for a DOI (with retries).
 *
 * @param string $doi Raw DOI; validated with filterValidDoi() first.
 * @return array|null The decoded message, or null when the DOI is invalid or the lookup fails.
 */
public function fetchWork($doi)
{
    $validDoi = $this->filterValidDoi($doi);

    return $validDoi === '' ? null : $this->fetchSingleDoiWithRetry($validDoi);
}
/**
 * Fetch a work and return its commonly used fields in one array.
 *
 * Returned keys: doi, title, joura (container title, falling back to the short
 * title), publisher, authors, author_str (authors joined with ','), dateno,
 * is_retracted (1|0), retract_reason, doilink (falls back to
 * https://doi.org/<doi>) and raw (the untouched Crossref message).
 *
 * @param string $doi
 * @return array|null Null when the DOI is invalid or nothing could be fetched.
 */
public function fetchWorkSummary($doi)
{
    $message = $this->fetchWork($doi);
    if (!$message) {
        return null;
    }

    $cleanDoi = $this->filterValidDoi($doi);
    $publisher = $this->getPublisher($message);
    $authors = $this->getAuthors($message);
    $retraction = $this->checkRetracted($message);

    // Prefer the full container title; fall back to the abbreviated one.
    $journal = $publisher['short_title'] ?? '';
    if (!empty($publisher['title'])) {
        $journal = $publisher['title'];
    }

    $link = $this->getDolink($message);
    if (empty($link)) {
        $link = 'https://doi.org/' . $cleanDoi;
    }

    return [
        'doi' => $cleanDoi,
        'title' => $this->getTitle($message),
        'joura' => $journal,
        'publisher' => $publisher,
        'authors' => $authors,
        'author_str' => empty($authors) ? '' : implode(',', $authors),
        'dateno' => $this->getVolumeIssuePages($message),
        'is_retracted' => !empty($retraction['is_retracted']) ? 1 : 0,
        'retract_reason' => $retraction['reason'] ?? '',
        'doilink' => $link,
        'raw' => $message,
    ];
}
/**
 * Query Crossref for a single DOI, retrying transient failures.
 *
 * Behaviour:
 *  - At least one request is always sent; `maxRetry` is the total number of attempts.
 *  - HTTP 200: the JSON body is decoded and its `message` returned when `status` is "ok";
 *    anything else in a 200 response yields null without retrying.
 *  - HTTP 429: wait 5 seconds before the next attempt.
 *  - Other 4xx responses except 408 (e.g. 404 for an unknown DOI) are treated as
 *    permanent and return null immediately instead of burning retries.
 *  - Everything else (network error, 5xx, 408): wait 1 second before the next attempt.
 *  - No sleep is performed after the final attempt.
 *
 * @param string $doi An already validated DOI.
 * @return array|null Crossref `message` array, or null on failure.
 */
private function fetchSingleDoiWithRetry($doi)
{
    $url = $this->crossrefUrl . rawurlencode($doi);
    $userAgent = 'DOI-Fetcher/1.0';
    if (!empty($this->mailto)) {
        $url .= "?mailto=" . rawurlencode($this->mailto);
        $userAgent .= " (mailto:{$this->mailto})";
    }

    $attempts = max(1, (int)$this->maxRetry);
    for ($attempt = 1; $attempt <= $attempts; $attempt++) {
        $ch = curl_init();
        if ($ch === false) {
            return null;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
        // Keep certificate and host verification on: disabling them lets anyone on the
        // network path forge metadata, including the retraction status.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
        curl_setopt($ch, CURLOPT_HTTPHEADER, [
            "User-Agent: {$userAgent}"
        ]);
        $response = curl_exec($ch);
        $httpCode = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        if ($httpCode === 200) {
            $data = is_string($response) ? json_decode($response, true) : null;
            return (is_array($data) && isset($data['status']) && $data['status'] == 'ok')
                ? ($data['message'] ?? null)
                : null;
        }

        // Client errors other than timeout/rate limit will not change on retry.
        if ($httpCode >= 400 && $httpCode < 500 && $httpCode !== 408 && $httpCode !== 429) {
            return null;
        }

        if ($attempt < $attempts) {
            sleep($httpCode === 429 ? 5 : 1);
        }
    }

    return null;
}
/**
 * Return the first entry of the Crossref `title` list.
 *
 * @param array $aDoiInfo Crossref message.
 * @return string The title, or '' when none is present.
 */
public function getTitle($aDoiInfo = [])
{
    if (isset($aDoiInfo['title'][0])) {
        return $aDoiInfo['title'][0];
    }

    return '';
}
/**
 * Collect the journal / publisher related fields of a Crossref message.
 *
 * @param array $aDoiInfo Crossref message.
 * @return array Keys: title (first container-title), short_title (first
 *               short-container-title), ISSN (list) and publisher; missing
 *               values default to '' (or [] for ISSN).
 */
public function getPublisher($aDoiInfo = [])
{
    $info = [];
    $info['title'] = $aDoiInfo['container-title'][0] ?? '';
    $info['short_title'] = $aDoiInfo['short-container-title'][0] ?? '';
    $info['ISSN'] = isset($aDoiInfo['ISSN']) ? $aDoiInfo['ISSN'] : [];
    $info['publisher'] = isset($aDoiInfo['publisher']) ? $aDoiInfo['publisher'] : '';

    return $info;
}
/**
 * Build the list of author display names from a Crossref message.
 *
 * Each person is rendered as "given family" with surplus whitespace removed, so
 * an entry carrying only one of the two parts no longer gets a dangling space.
 * Entries without given/family fall back to the `name` field, which Crossref uses
 * for organisational authors. Entries with no usable name are skipped.
 *
 * @param array $aDoiInfo Crossref message.
 * @return array List of non-empty author names in their original order.
 */
public function getAuthors($aDoiInfo = [])
{
    $authors = [];
    $rows = $aDoiInfo['author'] ?? [];
    if (!is_array($rows)) {
        return $authors;
    }

    foreach ($rows as $author) {
        if (!is_array($author)) {
            continue;
        }
        $given = trim((string)($author['given'] ?? ''));
        $family = trim((string)($author['family'] ?? ''));
        $name = trim($given . ' ' . $family);
        if ($name === '') {
            // Organisation / consortium authors carry a single `name` field.
            $name = trim((string)($author['name'] ?? ''));
        }
        if ($name !== '') {
            $authors[] = $name;
        }
    }

    return $authors;
}
/**
 * Return the publication year taken from `issued.date-parts[0][0]`.
 *
 * @param array $aDoiInfo Crossref message.
 * @return string The year as a string, or '' when it is missing or empty.
 */
public function getPublishYear($aDoiInfo = [])
{
    $year = $aDoiInfo['issued']['date-parts'][0][0] ?? null;

    return empty($year) ? '' : (string)$year;
}
/**
 * Format year, volume(issue) and page range as one string joined by ':',
 * e.g. "2024:10(2):100-120". Missing parts are simply left out.
 *
 * The issue is only emitted together with a volume. Pages come from
 * page.start/page.end or first-page/last-page when available, otherwise the raw
 * `page` value is used as-is.
 *
 * @param array $aDoiInfo Crossref message.
 * @return string
 */
public function getVolumeIssuePages($aDoiInfo = [])
{
    $segments = [];

    $year = $this->getPublishYear($aDoiInfo);
    if ($year) {
        $segments[] = $year;
    }

    $volume = isset($aDoiInfo['volume']) ? $aDoiInfo['volume'] : '';
    if ($volume) {
        $issue = isset($aDoiInfo['issue']) ? $aDoiInfo['issue'] : '';
        $suffix = $issue ? "({$issue})" : '';
        $segments[] = $volume . $suffix;
    }

    $firstPage = $aDoiInfo['page']['start'] ?? ($aDoiInfo['first-page'] ?? '');
    $lastPage = $aDoiInfo['page']['end'] ?? ($aDoiInfo['last-page'] ?? '');
    if ($firstPage) {
        $tail = $lastPage ? "-{$lastPage}" : '';
        $pageRange = $firstPage . $tail;
    } else {
        $pageRange = $aDoiInfo['page'] ?? '';
    }
    if ($pageRange) {
        $segments[] = $pageRange;
    }

    return implode(':', $segments);
}
/**
 * Decide whether a Crossref record is (or announces) a retraction/correction.
 *
 * Checks run in the order below; every matching check overwrites the reason, so
 * the reason of the LAST matching check is reported:
 *  1. `type` is "retraction" or "correction"
 *  2. `subtype` is "retraction" or "correction"
 *  3. `update-type` list contains "retraction"
 *  4. `relation` has an "is-retraction-of" or "corrects" entry
 *  5. `update-to` has an entry whose type or label contains "retract"
 *     (the record is a notice that updates another work)
 *  6. `updated-by` has an entry whose type or label contains "retract"
 *     (the record itself was retracted by a later notice)
 *  7. a title contains retraction / retracted / withdrawal / withdrawn
 *
 * Check 6 is what flags a retracted article when its own DOI is looked up, which
 * is the case that matters when validating a cited reference. Membership tests
 * are strict so list-shaped arrays with integer keys cannot match on PHP 7.
 *
 * @param array $aDoiInfo Crossref message.
 * @return array ['is_retracted' => bool, 'reason' => string]
 */
public function checkRetracted($aDoiInfo = [])
{
    $isRetracted = false;
    $reason = "未撤稿";
    $noticeTypes = ['retraction', 'correction'];

    $sType = strtolower((string)($aDoiInfo['type'] ?? ''));
    $sSubtype = strtolower((string)($aDoiInfo['subtype'] ?? ''));
    if ($sType && in_array($sType, $noticeTypes, true)) {
        $isRetracted = true;
        $reason = "文章类型为{$sType}(撤稿/更正声明)";
    }
    if ($sSubtype && in_array($sSubtype, $noticeTypes, true)) {
        $isRetracted = true;
        $reason = "文章类型为{$sSubtype}(撤稿/更正声明)";
    }

    if (isset($aDoiInfo['update-type']) && is_array($aDoiInfo['update-type']) && in_array('retraction', $aDoiInfo['update-type'], true)) {
        $isRetracted = true;
        $reason = "官方标记为撤稿update-type: retraction";
    }

    if (!empty($aDoiInfo['relation']) && is_array($aDoiInfo['relation'])) {
        foreach ($aDoiInfo['relation'] as $relType => $relItems) {
            if (in_array($relType, ['is-retraction-of', 'corrects'], true)) {
                $isRetracted = true;
                $relatedDoi = $relItems[0]['id'] ?? '未知';
                $reason = "关联撤稿文章{$relatedDoi}(关系:{$relType}";
                break;
            }
        }
    }

    if (isset($aDoiInfo['update-to']) && is_array($aDoiInfo['update-to'])) {
        foreach ($aDoiInfo['update-to'] as $update) {
            $updateType = strtolower((string)($update['type'] ?? ''));
            $updateLabel = strtolower((string)($update['label'] ?? ''));
            if (strpos($updateType, 'retract') !== false || strpos($updateLabel, 'retract') !== false) {
                $isRetracted = true;
                $reason = "update-to 标记撤稿({$updateType}/{$updateLabel}";
                break;
            }
        }
    }

    // The retracted article itself lists the notice under `updated-by`.
    if (isset($aDoiInfo['updated-by']) && is_array($aDoiInfo['updated-by'])) {
        foreach ($aDoiInfo['updated-by'] as $notice) {
            $noticeType = strtolower((string)($notice['type'] ?? ''));
            $noticeLabel = strtolower((string)($notice['label'] ?? ''));
            if (strpos($noticeType, 'retract') !== false || strpos($noticeLabel, 'retract') !== false) {
                $isRetracted = true;
                $reason = "updated-by 标记撤稿({$noticeType}/{$noticeLabel})";
                break;
            }
        }
    }

    $aTitles = $aDoiInfo['title'] ?? [];
    foreach ((array)$aTitles as $value) {
        if (!is_string($value)) {
            continue;
        }
        $sTitleLower = strtolower($value);
        if (strpos($sTitleLower, 'retraction') !== false || strpos($sTitleLower, 'retracted') !== false
            || strpos($sTitleLower, 'withdrawal') !== false || strpos($sTitleLower, 'withdrawn') !== false) {
            $isRetracted = true;
            $reason = "标题包含撤稿关键词";
            break;
        }
    }

    return [
        'is_retracted' => $isRetracted,
        'reason' => $reason,
    ];
}
/**
 * Return the resolver link stored in the `URL` field of a Crossref message.
 *
 * @param array $aDoiInfo Crossref message.
 * @return string The URL, or '' when the field is absent.
 */
public function getDolink($aDoiInfo = [])
{
    return isset($aDoiInfo['URL']) ? $aDoiInfo['URL'] : '';
}
/**
 * Turn a Crossref date object (['date-parts' => [[Y, M, D]]]) into "Y", "Y-MM"
 * or "Y-MM-DD". Month and day are left-padded to two digits; parts that are
 * absent are omitted.
 *
 * @param array $dateObj Crossref date object.
 * @return string Formatted date, or '' when no date parts are present.
 */
public function parseDateParts($dateObj)
{
    $parts = isset($dateObj['date-parts'][0]) ? $dateObj['date-parts'][0] : [];
    if (empty($parts)) {
        return '';
    }

    $formatted = (string)($parts[0] ?? '');
    foreach ([1, 2] as $position) {
        $piece = $parts[$position] ?? '';
        if ($piece !== '') {
            $formatted .= '-' . str_pad((string)$piece, 2, '0', STR_PAD_LEFT);
        }
    }

    return $formatted;
}
/**
 * Quality-check one citation. Two independent verdicts are produced:
 *  (1) problem_flag   - is the cited work retracted/corrected according to Crossref?
 *  (2) relevance_flag - does the citing context share evidence (title words,
 *                       author, year, journal words) with the cited work?
 *
 * Designed for references that come without abstract or keywords. Without a
 * usable DOI in refer_doi/doilink the problem_flag stays 'unknown'.
 *
 * Score = 0.60 * title overlap + 0.20 * author hit + 0.15 * year hit
 *         + 0.05 * journal overlap, rounded to 4 decimals.
 *  - 'related'              score >= 0.35 and an author or year hit
 *  - 'unsure'               score >= 0.25 otherwise
 *  - 'unsure_background'    lower score but a background phrase is present
 *  - 'suspicious_unrelated' everything else
 *
 * @param string $contextText Citing context (English; ideally the citing sentence plus few neighbours).
 * @param array  $refMeta     Cited reference; keys read: refer_doi, doilink, title, author, joura, dateno.
 * @param array  $options     - check_retraction (bool): query Crossref for the problem flag; default true
 *                            - background_phrases (array): phrases marking generic background
 *                              citations; default is a built-in list
 * @return array [
 *     'problem_flag'    => 'ok'|'retracted_or_corrected'|'unknown',
 *     'problem_reason'  => string,
 *     'relevance_flag'  => 'related'|'unsure'|'unsure_background'|'suspicious_unrelated',
 *     'relevance_score' => float,
 *     'reason'          => string,
 * ]
 */
public function qcCitation(string $contextText, array $refMeta, array $options = []): array
{
    $contextText = trim($contextText);
    $ctxLower = strtolower($contextText);

    $doi = $this->extractDoiFromMeta(
        (string)($refMeta['refer_doi'] ?? ''),
        (string)($refMeta['doilink'] ?? '')
    );

    // 1) Retraction / correction verdict (hard rule, drives problem_flag).
    $problemFlag = 'unknown';
    $wantsRetractionCheck = isset($options['check_retraction']) ? (bool)$options['check_retraction'] : true;
    if (!$wantsRetractionCheck) {
        $problemReason = 'Skip retraction check';
    } elseif (empty($doi)) {
        $problemReason = 'DOI is empty';
    } else {
        $summary = $this->fetchWorkSummary($doi);
        if (!$summary || !isset($summary['is_retracted'])) {
            $problemReason = 'Crossref fetch failed or returned unexpected data';
        } elseif ((int)$summary['is_retracted'] === 1) {
            $problemFlag = 'retracted_or_corrected';
            $problemReason = !empty($summary['retract_reason']) ? $summary['retract_reason'] : 'Crossref indicates retraction/correction';
        } else {
            $problemFlag = 'ok';
            $problemReason = 'Crossref indicates not retracted/corrected';
        }
    }

    // 2) Relevance verdict (soft rules based on evidence hits).
    if (isset($options['background_phrases'])) {
        $phrases = (array)$options['background_phrases'];
    } else {
        $phrases = [
            'several studies',
            'many studies',
            'the literature',
            'the existing literature',
            'has been reported',
            'have been reported',
            'it has been shown',
            'previous studies',
            'the study suggests',
            'the literature suggests',
            'in the literature',
        ];
    }
    $isBackground = false;
    foreach ($phrases as $phrase) {
        $needle = strtolower(trim((string)$phrase));
        if ($needle === '' || $needle === '0') {
            continue;
        }
        if (strpos($ctxLower, $needle) !== false) {
            $isBackground = true;
            break;
        }
    }

    $evidence = $this->buildEvidenceTokens([
        'title' => (string)($refMeta['title'] ?? ''),
        'author' => (string)($refMeta['author'] ?? ''),
        'journal' => (string)($refMeta['joura'] ?? ''),
        'year' => (string)($refMeta['dateno'] ?? ''),
    ]);
    $ctxTokens = $this->tokenize($contextText);

    $titleTokens = $evidence['titleTokens'] ?? [];
    $authorTokens = $evidence['authorTokens'] ?? [];
    $journalTokens = $evidence['journalTokens'] ?? [];
    $yearToken = $evidence['yearToken'] ?? '';

    // Share of the reference's title / journal words that also occur in the context.
    $titleOverlap = empty($titleTokens)
        ? 0.0
        : count(array_intersect($titleTokens, $ctxTokens)) / max(1, count($titleTokens));
    $journalOverlap = empty($journalTokens)
        ? 0.0
        : count(array_intersect($journalTokens, $ctxTokens)) / max(1, count($journalTokens));

    // Author counts as hit when any non-empty author token appears as a context token.
    $authorHit = 0.0;
    foreach ($authorTokens as $authorToken) {
        if ($authorToken === '') {
            continue;
        }
        if (in_array($authorToken, $ctxTokens, true)) {
            $authorHit = 1.0;
            break;
        }
    }

    $yearHit = (!empty($yearToken) && strpos($ctxLower, (string)$yearToken) !== false) ? 1.0 : 0.0;

    // Combined score; higher means more likely related.
    $score = round((
        0.60 * $titleOverlap +
        0.20 * $authorHit +
        0.15 * $yearHit +
        0.05 * $journalOverlap
    ), 4);

    if ($score >= 0.35 && ($authorHit > 0.0 || $yearHit > 0.0)) {
        $relevanceFlag = 'related';
        $headline = 'title_keyword_overlap_high=' . $titleOverlap;
    } elseif ($score >= 0.25) {
        $relevanceFlag = 'unsure';
        $headline = 'evidence_score_mid=' . $score;
    } elseif ($isBackground) {
        $relevanceFlag = 'unsure_background';
        $headline = 'background_phrases_detected';
    } else {
        $relevanceFlag = 'suspicious_unrelated';
        $headline = 'evidence_score_low=' . $score;
    }

    $reason = implode('; ', [
        $headline,
        'titleOverlap=' . $titleOverlap,
        'authorHit=' . $authorHit,
        'yearHit=' . $yearHit,
        'journalOverlap=' . $journalOverlap,
    ]);

    return [
        'problem_flag' => $problemFlag,
        'problem_reason' => $problemReason,
        'relevance_flag' => $relevanceFlag,
        'relevance_score' => (float)$score,
        'reason' => $reason,
    ];
}
/**
 * Derive a bare DOI from a reference's `refer_doi` and `doilink` fields.
 *
 * The two fields are tried in that order and the first one yielding a valid DOI
 * wins, so an unusable refer_doi no longer hides a good doilink. Each value may be
 *  - a bare DOI ("10.1234/abc"),
 *  - a resolver URL ("https://doi.org/10.1234/abc", "http://dx.doi.org/..."),
 *    whose query string / fragment is dropped and whose path is URL-decoded, or
 *  - a "doi:"-prefixed DOI.
 *
 * Note: the URL pattern uses "~" as delimiter. With "#" as delimiter, the "#"
 * inside the character class ends the pattern early and preg_match() fails with
 * "Unknown modifier", so URLs were never recognised.
 *
 * @param string $referDoi
 * @param string $doilink
 * @return string A DOI accepted by filterValidDoi(), or '' when none can be extracted.
 */
private function extractDoiFromMeta(string $referDoi, string $doilink): string
{
    foreach ([$referDoi, $doilink] as $raw) {
        $candidate = trim($raw);
        if ($candidate === '') {
            continue;
        }

        if (preg_match('~doi\.org/([^?#\s]+)~i', $candidate, $m)) {
            // Resolver URL: keep the path after the host, decoded (e.g. %2F -> /).
            $candidate = rawurldecode($m[1]);
        } elseif (stripos($candidate, 'doi:') === 0) {
            $candidate = substr($candidate, 4);
        }

        $doi = $this->filterValidDoi($candidate);
        if ($doi !== '') {
            return $doi;
        }
    }

    return '';
}
/**
 * Prepare the evidence tokens used for the rough relevance comparison.
 *
 * - titleTokens / journalTokens: tokenize() output without stop words and
 *   without tokens shorter than 4 characters.
 * - authorTokens: extractAuthorTokens() output without empty values and stop words.
 * - yearToken: first 19xx/20xx number found in the 'year' value, or ''.
 *
 * @param array $src Keys read: title, author, journal, year.
 * @return array ['titleTokens' => string[], 'authorTokens' => string[],
 *                'journalTokens' => string[], 'yearToken' => string]
 */
private function buildEvidenceTokens(array $src): array
{
    $stop = [
        'the','a','an','and','or','of','in','on','for','with','to','from','by','at','as','is','are',
        'was','were','be','been','being','that','this','these','those','which','who','whom','it','its',
        'we','our','us','they','their','them','i','you','your','he','she','his','her',
        'study','studies','report','reported','reports','model','models','analysis','analyses','method','methods',
        'results','result','using','used','show','shown','demonstrated','demonstrate',
    ];

    // Shared filter for title and journal words.
    $isInformative = function ($token) use ($stop) {
        return !in_array($token, $stop, true) && mb_strlen($token) >= 4;
    };
    $informativeWords = function (array $tokens) use ($isInformative) {
        return array_values(array_filter(array_unique($tokens), $isInformative));
    };

    $authorTokens = [];
    foreach ($this->extractAuthorTokens((string)($src['author'] ?? '')) as $rawAuthor) {
        $cleaned = trim($rawAuthor);
        if (empty($cleaned) || in_array($cleaned, $stop, true)) {
            continue;
        }
        $authorTokens[] = $cleaned;
    }

    $yearToken = '';
    if (preg_match('/(19\d{2}|20\d{2})/', (string)($src['year'] ?? ''), $found)) {
        $yearToken = (string)$found[1];
    }

    return [
        'titleTokens' => $informativeWords($this->tokenize((string)($src['title'] ?? ''))),
        'authorTokens' => array_values(array_unique($authorTokens)),
        'journalTokens' => $informativeWords($this->tokenize((string)($src['journal'] ?? ''))),
        'yearToken' => $yearToken,
    ];
}
/**
 * Extract lower-cased family-name tokens from an author string (simplified).
 *
 * The string is split on ',', ';' and ' and '. For each part the last word is
 * taken as the family name ("John Smith" -> "smith"), unless that word consists
 * only of initials ("Smith J", "Smith J.A.", "Smith J-P"), in which case the
 * first word is used. Previously "Smith J" produced the token "j", which the
 * length filter then discarded, so "Family Initials" author lists never yielded
 * any token.
 *
 * Tokens keep only ASCII letters, '.' and '-', lose leading/trailing '.'/'-'
 * (so they can match tokenize() output), are de-duplicated, and are dropped when
 * shorter than 4 characters.
 *
 * Known limitation: a short all-caps family name in last position ("Wei LI")
 * is indistinguishable from initials.
 *
 * @param string $authorStr
 * @return array List of tokens.
 */
private function extractAuthorTokens(string $authorStr): array
{
    $authorStr = trim($authorStr);
    if ($authorStr === '') {
        return [];
    }

    $parts = preg_split('/[,;]| and /i', $authorStr);
    if ($parts === false) {
        return [];
    }

    $tokens = [];
    foreach ($parts as $p) {
        $p = trim((string)$p);
        if ($p === '') {
            continue;
        }
        $words = preg_split('/\s+/', $p);
        if (empty($words)) {
            continue;
        }

        $cand = trim((string)end($words));
        // Trailing initials mean the family name comes first ("Smith J").
        if (count($words) > 1 && preg_match('/^(?:[A-Z]\.?-?){1,3}$/', $cand)) {
            $cand = trim((string)reset($words));
        }

        // Keep letters, dots and hyphens only, then strip punctuation at the edges.
        $cand = (string)preg_replace('/[^A-Za-z\.\-]/', '', $cand);
        $cand = strtolower(trim($cand, '.-'));
        if ($cand !== '') {
            $tokens[] = $cand;
        }
    }

    // Very short tokens carry too little signal.
    return array_values(array_filter(array_unique($tokens), function ($t) {
        return mb_strlen($t) >= 4;
    }));
}
/**
 * Lightweight tokenizer for English text: lower-cases the input, splits on
 * every run of characters other than a-z / 0-9, discards tokens shorter than
 * 3 characters and removes duplicates (first occurrence order is kept).
 *
 * @param string $text
 * @return array List of unique tokens.
 */
private function tokenize(string $text): array
{
    $normalized = strtolower(trim($text));
    if ($normalized === '') {
        return [];
    }

    $pieces = preg_split('/[^a-z0-9]+/i', $normalized, -1, PREG_SPLIT_NO_EMPTY);

    // Keep only tokens long enough to be informative.
    $kept = array_filter($pieces, function ($piece) {
        return mb_strlen($piece) >= 3;
    });

    return array_values(array_unique($kept));
}
/**
 * Lightweight English sentence splitter: breaks after '.', '?' or '!' when
 * whitespace follows, so in-sentence markers such as [n] stay attached.
 * All whitespace (including line breaks) is collapsed to single spaces first.
 *
 * @param string $text
 * @return array List of trimmed, non-empty sentences.
 */
private function splitEnglishSentences(string $text): array
{
    $clean = trim($text);
    if ($clean === '') {
        return [];
    }

    // Collapse line breaks and repeated blanks so they cannot interfere with splitting.
    $flattened = preg_replace('/\s+/u', ' ', $clean);

    // Break right after terminal punctuation that is followed by whitespace.
    $pieces = array_map('trim', (array)preg_split('/(?<=[\.\?\!])\s+/', $flattened));

    return array_values(array_filter($pieces, function ($piece) {
        return $piece !== '';
    }));
}
}