mailto = (string)$config['mailto'];
            if (isset($config['timeout'])) $this->timeout = intval($config['timeout']);
            if (isset($config['maxRetry'])) $this->maxRetry = intval($config['maxRetry']);
            if (isset($config['crossrefUrl'])) $this->crossrefUrl = (string)$config['crossrefUrl'];
        }
    }

    /**
     * Set the contact e-mail used as the `mailto` query parameter and in the
     * User-Agent header of Crossref requests.
     *
     * @param string $mailto Contact address (cast to string).
     * @return $this
     */
    public function setMailto($mailto)
    {
        $this->mailto = (string)$mailto;
        return $this;
    }

    /**
     * Convert an in-text citation number to a reference row index:
     * [n] in the body corresponds to production_article_refer.index = n - 1
     * (index is 0-based). The result is clamped to be non-negative.
     *
     * @param int $citationMark Citation number from the body, e.g. 13 (from "[13]")
     * @return int production_article_refer.index, e.g. 12
     */
    public function referIndexFromCitationMark(int $citationMark): int
    {
        return max(0, $citationMark - 1);
    }

    /**
     * Inverse conversion: production_article_refer.index (0-based) to the
     * in-text citation number [n]. The result is clamped to be non-negative.
     *
     * @param int $referIndex production_article_refer.index, e.g. 12
     * @return int Citation number n, e.g. 13
     */
    public function citationMarkFromReferIndex(int $referIndex): int
    {
        return max(0, $referIndex + 1);
    }

    /**
     * Batch citation QC (no database access).
     *
     * - Takes the article section contents (the `content` values of t_article_main).
     * - Takes the reference entries (rows of production_article_refer).
     * - Extracts the English sentence context around every [n], maps it to
     *   refer.index = n - 1 and calls qcCitation() for it.
     *
     * This method performs no database queries, so callers are free to source
     * the data however they like.
     *
     * A reference that is cited in several sentences is checked against Crossref
     * only once; the first problem_flag/problem_reason is reused for the later
     * occurrences, while relevance is still evaluated per occurrence.
     *
     * @param array $articleMainContents Content fragments in `sort` order; each element is a
     *                                   string or an array with a `content` key
     * @param array $referRows           production_article_refer rows (should contain
     *                                   index/title/author/joura/dateno/refer_doi/doilink)
     * @param array $options             Passed through to qcCitation(); additionally supports:
     *                                   - sentence_window (int) number of neighbouring sentences
     *                                     on each side, default 1 (previous + current + next)
     * @return array List of results, each with citation_mark/refer_index/context/ref_meta/qc
     */
    public function qcArticleCitations(array $articleMainContents, array $referRows, array $options = []): array
    {
        $window = isset($options['sentence_window']) ? max(0, intval($options['sentence_window'])) : 1;

        // 1) Assemble the plain text of the whole article (keeping the [n] marks).
        $chunks = [];
        foreach ($articleMainContents as $row) {
            $text = is_array($row) ? (string)($row['content'] ?? '') : (string)$row;
            if ($text === '') {
                continue;
            }

            // Drop the custom <blue> tag and ordinary tags; [n] is plain text and survives.
            $text = (string)preg_replace('/<\s*\/?\s*blue[^>]*>/i', '', $text);
            $text = strip_tags($text);
            $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

            // preg_replace() yields null on invalid UTF-8 with the /u flag;
            // the cast turns that into '' so the fragment is skipped instead of
            // pushing null into the list.
            $text = (string)preg_replace('/\s+/u', ' ', trim($text));
            if ($text !== '') {
                $chunks[] = $text;
            }
        }

        $fullText = implode("\n", $chunks);
        if ($fullText === '') {
            return [];
        }

        // 2) Build the reference map: refer_index => row.
        $referMap = [];
        foreach ($referRows as $r) {
            if (!is_array($r) || !isset($r['index'])) {
                continue;
            }
            $referMap[intval($r['index'])] = $r;
        }
        if (empty($referMap)) {
            // Nothing can match, so no result could be produced.
            return [];
        }

        // 3) Split into English sentences (simple, robust variant).
        $sentences = $this->splitEnglishSentences($fullText);
        if (empty($sentences)) {
            return [];
        }
        $lastSentence = count($sentences) - 1;

        // Retraction verdict per reference index, so a reference cited many
        // times triggers only one Crossref round trip.
        $problemCache = [];

        // 4) Walk the sentences and pick up every [n] they contain.
        $results = [];
        foreach ($sentences as $si => $sent) {
            if (!preg_match_all('/\[(\d+)\]/', $sent, $m)) {
                continue;
            }

            $marks = array_unique(array_map('intval', $m[1]));
            foreach ($marks as $citationMark) {
                if ($citationMark <= 0) {
                    continue;
                }

                $referIndex = $this->referIndexFromCitationMark($citationMark);
                if (!isset($referMap[$referIndex])) {
                    continue;
                }

                $start = max(0, $si - $window);
                $end = min($lastSentence, $si + $window);
                $ctx = implode(' ', array_slice($sentences, $start, $end - $start + 1));
                $ctx = trim((string)preg_replace('/\s+/u', ' ', $ctx));

                $refMeta = $referMap[$referIndex];

                if (isset($problemCache[$referIndex])) {
                    // Left operand of + wins, so the retraction check is skipped here.
                    $qc = $this->qcCitation($ctx, $refMeta, ['check_retraction' => false] + $options);
                    $qc['problem_flag'] = $problemCache[$referIndex]['problem_flag'];
                    $qc['problem_reason'] = $problemCache[$referIndex]['problem_reason'];
                } else {
                    $qc = $this->qcCitation($ctx, $refMeta, $options);
                    $problemCache[$referIndex] = [
                        'problem_flag' => $qc['problem_flag'] ?? 'unknown',
                        'problem_reason' => $qc['problem_reason'] ?? '',
                    ];
                }

                $results[] = [
                    'citation_mark' => $citationMark, // number n in the body (from [n])
                    'refer_index' => $referIndex,     // production_article_refer.index
                    'context' => $ctx,
                    'ref_meta' => [
                        'p_refer_id' => $refMeta['p_refer_id'] ?? 0,
                        'title' => $refMeta['title'] ?? '',
                        'author' => $refMeta['author'] ?? '',
                        'joura' => $refMeta['joura'] ?? '',
                        'dateno' => $refMeta['dateno'] ?? '',
                        'refer_doi' => $refMeta['refer_doi'] ?? '',
                        'doilink' => $refMeta['doilink'] ?? '',
                        'index' => $refMeta['index'] ?? $referIndex,
                    ],
                    'qc' => $qc,
                ];
            }
        }

        return $results;
    }

    /**
     * Filter out invalid DOIs (only the "10.xxxx/xxx" form is accepted).
     *
     * @param string $doi
     * @return string The trimmed DOI, or '' when it does not match
     */
    public function filterValidDoi($doi = '')
    {
        $doi = trim((string)$doi);
        if ($doi === '') {
            return '';
        }

        return preg_match('/^10\.\d{4,}\/.+/', $doi) ? $doi : '';
    }

    /**
     * Fetch the Crossref `message` for a DOI (with retries).
     *
     * @param string $doi
     * @return array|null null when the DOI is invalid or the fetch fails
     */
    public function fetchWork($doi)
    {
        $doi = $this->filterValidDoi($doi);
        if ($doi === '') {
            return null;
        }

        return $this->fetchSingleDoiWithRetry($doi);
    }

    /**
     * Return the commonly used fields of a work
     * (title / journal / authors / volume-issue-pages / retraction / URL).
     *
     * @param string $doi
     * @return array|null null when the work could not be fetched
     */
    public function fetchWorkSummary($doi)
    {
        $msg = $this->fetchWork($doi);
        if (!$msg) {
            return null;
        }

        // fetchWork() succeeded, so this is the non-empty, trimmed DOI.
        $validDoi = $this->filterValidDoi($doi);

        $publisher = $this->getPublisher($msg);
        $joura = !empty($publisher['title']) ? $publisher['title'] : ($publisher['short_title'] ?? '');
        $authors = $this->getAuthors($msg);
        $retractInfo = $this->checkRetracted($msg);

        $dolink = $this->getDolink($msg);
        if (empty($dolink)) {
            $dolink = 'https://doi.org/' . $validDoi;
        }

        return [
            'doi' => $validDoi,
            'title' => $this->getTitle($msg),
            'joura' => $joura,
            'publisher' => $publisher,
            'authors' => $authors,
            'author_str' => empty($authors) ? '' : implode(',', $authors),
            'dateno' => $this->getVolumeIssuePages($msg),
            'is_retracted' => !empty($retractInfo['is_retracted']) ? 1 : 0,
            'retract_reason' => $retractInfo['reason'] ?? '',
            'doilink' => $dolink,
            'raw' => $msg,
        ];
    }

    /**
     * Query a single DOI (with retries).
     *
     * Makes at most $this->maxRetry attempts. HTTP 429 waits 5 seconds before
     * the next attempt, other retryable failures wait 1 second. Permanent
     * client errors (4xx other than 408/429, e.g. 404 for an unknown DOI) are
     * not retried, and no sleep happens after the final attempt.
     *
     * @param string $doi An already validated DOI
     * @return array|null The `message` object of the response, or null on failure
     */
    private function fetchSingleDoiWithRetry($doi)
    {
        // The request is identical for every attempt, so build it once.
        $url = $this->crossrefUrl . rawurlencode($doi);
        $userAgent = 'DOI-Fetcher/1.0';
        if (!empty($this->mailto)) {
            $url .= "?mailto=" . rawurlencode($this->mailto);
            // Strip CR/LF so a malformed address cannot inject extra headers.
            $safeMailto = (string)preg_replace('/[\r\n]+/', '', (string)$this->mailto);
            $userAgent = "DOI-Fetcher/1.0 (mailto:{$safeMailto})";
        }

        $maxAttempts = (int)$this->maxRetry;
        for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
            $ch = curl_init();
            if ($ch === false) {
                return null;
            }

            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
            // Verify the TLS certificate: the response decides whether a reference
            // is reported as retracted, so it must not be spoofable in transit.
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
            curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
            curl_setopt($ch, CURLOPT_HTTPHEADER, [
                "User-Agent: {$userAgent}"
            ]);

            $response = curl_exec($ch);
            $httpCode = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
            curl_close($ch);

            if ($httpCode === 200) {
                $data = is_string($response) ? json_decode($response, true) : null;
                if (is_array($data) && isset($data['status']) && $data['status'] === 'ok'
                    && isset($data['message']) && is_array($data['message'])) {
                    return $data['message'];
                }
                return null;
            }

            // A permanent client error will not change on retry.
            if ($httpCode >= 400 && $httpCode < 500 && $httpCode !== 429 && $httpCode !== 408) {
                return null;
            }

            if ($attempt < $maxAttempts) {
                // Rate limited: back off longer.
                sleep($httpCode === 429 ? 5 : 1);
            }
        }

        return null;
    }

    /**
     * Extract the title (first entry of `title`).
     *
     * @param array $aDoiInfo Crossref message
     * @return string
     */
    public function getTitle($aDoiInfo = [])
    {
        return $aDoiInfo['title'][0] ?? '';
    }

    /**
     * Extract journal / publisher related information.
     *
     * @param array $aDoiInfo Crossref message
     * @return array Keys: title, short_title, ISSN, publisher
     */
    public function getPublisher($aDoiInfo = [])
    {
        return [
            'title' => $aDoiInfo['container-title'][0] ?? '',
            'short_title' => $aDoiInfo['short-container-title'][0] ?? '',
            'ISSN' => $aDoiInfo['ISSN'] ?? [],
            'publisher' => $aDoiInfo['publisher'] ?? '',
        ];
    }

    /**
     * Extract the author list as "Given Family" strings.
     *
     * Entries that carry only a `name` field (no given/family) are included
     * by that name.
     *
     * @param array $aDoiInfo Crossref message
     * @return array List of author names
     */
    public function getAuthors($aDoiInfo = [])
    {
        $authors = [];
        if (empty($aDoiInfo['author']) || !is_array($aDoiInfo['author'])) {
            return $authors;
        }

        foreach ($aDoiInfo['author'] as $author) {
            $given = (string)($author['given'] ?? '');
            $family = (string)($author['family'] ?? '');

            // trim() avoids a dangling space when only one of the two parts exists.
            $name = trim($given . ' ' . $family);
            if ($name === '') {
                $name = trim((string)($author['name'] ?? ''));
            }
            if (!empty($name)) {
                $authors[] = $name;
            }
        }

        return $authors;
    }

    /**
     * Extract the publication year from `issued.date-parts`.
     *
     * @param array $aDoiInfo Crossref message
     * @return string Year, or '' when absent
     */
    public function getPublishYear($aDoiInfo = [])
    {
        if (!empty($aDoiInfo['issued']['date-parts'][0][0])) {
            return (string)$aDoiInfo['issued']['date-parts'][0][0];
        }

        return '';
    }

    /**
     * Build "year:volume(issue):firstPage-lastPage" (e.g. 2024:10(2):100-120).
     * Missing parts are left out.
     *
     * @param array $aDoiInfo Crossref message
     * @return string
     */
    public function getVolumeIssuePages($aDoiInfo = [])
    {
        $parts = [];

        $year = $this->getPublishYear($aDoiInfo);
        if ($year) {
            $parts[] = $year;
        }

        $volume = $aDoiInfo['volume'] ?? '';
        $issue = $aDoiInfo['issue'] ?? '';
        if ($volume) {
            $parts[] = $volume . ($issue ? "({$issue})" : '');
        }

        $pageStart = $aDoiInfo['page']['start'] ?? ($aDoiInfo['first-page'] ?? '');
        $pageEnd = $aDoiInfo['page']['end'] ?? ($aDoiInfo['last-page'] ?? '');
        if ($pageStart) {
            $pages = $pageStart . ($pageEnd ? "-{$pageEnd}" : '');
        } else {
            // `page` may be a plain string; anything non-scalar would break implode().
            $rawPage = $aDoiInfo['page'] ?? '';
            $pages = is_scalar($rawPage) ? (string)$rawPage : '';
        }
        if ($pages) {
            $parts[] = $pages;
        }

        return implode(':', $parts);
    }

    /**
     * Detect retracted (or corrected) works. The original comment states this
     * follows the same logic as Crossrefdoi.php.
     *
     * The checks run in order (type, subtype, update-type, relation, update-to,
     * title keywords); a later hit overwrites the reason of an earlier one.
     *
     * @param array $aDoiInfo Crossref message
     * @return array ['is_retracted' => bool, 'reason' => string]
     */
    public function checkRetracted($aDoiInfo = [])
    {
        $isRetracted = false;
        $reason = "未撤稿";

        $sType = (isset($aDoiInfo['type']) && is_string($aDoiInfo['type'])) ? strtolower($aDoiInfo['type']) : '';
        $sSubtype = (isset($aDoiInfo['subtype']) && is_string($aDoiInfo['subtype'])) ? strtolower($aDoiInfo['subtype']) : '';
        $noticeTypes = ['retraction', 'correction'];

        if ($sType && in_array($sType, $noticeTypes, true)) {
            $isRetracted = true;
            $reason = "文章类型为{$sType}(撤稿/更正声明)";
        }

        if ($sSubtype && in_array($sSubtype, $noticeTypes, true)) {
            $isRetracted = true;
            $reason = "文章类型为{$sSubtype}(撤稿/更正声明)";
        }

        if (isset($aDoiInfo['update-type']) && is_array($aDoiInfo['update-type'])
            && in_array('retraction', $aDoiInfo['update-type'])) {
            $isRetracted = true;
            $reason = "官方标记为撤稿(update-type: retraction)";
        }

        if (!empty($aDoiInfo['relation']) && is_array($aDoiInfo['relation'])) {
            foreach ($aDoiInfo['relation'] as $relType => $relItems) {
                // Strict comparison: under PHP 7 loose rules an integer key 0
                // would equal any non-numeric string and raise a false alarm.
                if (in_array($relType, ['is-retraction-of', 'corrects'], true)) {
                    $isRetracted = true;
                    $relatedDoi = $relItems[0]['id'] ?? '未知';
                    $reason = "关联撤稿文章{$relatedDoi}(关系:{$relType})";
                    break;
                }
            }
        }

        if (isset($aDoiInfo['update-to']) && is_array($aDoiInfo['update-to'])) {
            foreach ($aDoiInfo['update-to'] as $update) {
                $updateType = strtolower((string)($update['type'] ?? ''));
                $updateLabel = strtolower((string)($update['label'] ?? ''));
                if (strpos($updateType, 'retract') !== false || strpos($updateLabel, 'retract') !== false) {
                    $isRetracted = true;
                    $reason = "update-to 标记撤稿({$updateType}/{$updateLabel})";
                    break;
                }
            }
        }

        // `title` is expected to be a list; the cast also accepts a single string.
        $aTitles = (array)($aDoiInfo['title'] ?? []);
        foreach ($aTitles as $value) {
            if (!is_string($value)) {
                continue;
            }
            $sTitleLower = strtolower($value);
            if (strpos($sTitleLower, 'retraction') !== false
                || strpos($sTitleLower, 'retracted') !== false
                || strpos($sTitleLower, 'withdrawal') !== false
                || strpos($sTitleLower, 'withdrawn') !== false) {
                $isRetracted = true;
                $reason = "标题包含撤稿关键词";
                break;
            }
        }

        return [
            'is_retracted' => $isRetracted,
            'reason' => $reason,
        ];
    }

    /**
     * Return the DOI link (`URL` field) of the work.
     *
     * @param array $aDoiInfo Crossref message
     * @return string
     */
    public function getDolink($aDoiInfo = [])
    {
        return $aDoiInfo['URL'] ?? '';
    }

    /**
     * Format a Crossref date-parts object as "Y", "Y-MM" or "Y-MM-DD".
     *
     * @param array $dateObj Object holding `date-parts`
     * @return string '' when no date parts are present
     */
    public function parseDateParts($dateObj)
    {
        $parts = $dateObj['date-parts'][0] ?? [];
        if (empty($parts)) {
            return '';
        }

        $out = (string)($parts[0] ?? '');
        foreach ([1, 2] as $pos) {
            $piece = $parts[$pos] ?? '';
            if ($piece !== '') {
                // Month and day are zero-padded to two digits.
                $out .= '-' . str_pad((string)$piece, 2, '0', STR_PAD_LEFT);
            }
        }

        return $out;
    }

    /**
     * Citation QC. Judges
     * (1) whether the cited entry looks retracted/corrected (based on Crossref), and
     * (2) whether the citing context is related to the cited entry (based on evidence hits).
     *
     * Notes:
     * - Intended for cases without abstract/keywords (only title/author/journal/year
     *   plus the citing sentences are used).
     * - When no DOI can be derived from refer_doi/doilink, problem_flag is 'unknown'.
     *
     * @param string $contextText Context around the citation (English; ideally the citing
     *                            sentence plus a few neighbours)
     * @param array  $refMeta     Metadata of the cited entry (e.g. a production_article_refer row):
     *                            refer_doi / doilink / title / author / joura / dateno
     * @param array  $options     Optional:
     *                            - check_retraction (bool) query Crossref for retraction/correction;
     *                              default true
     *                            - background_phrases (array) phrases marking a generic background
     *                              citation; defaults to the built-in list
     *
     * @return array
     * [
     *   'problem_flag' => 'ok'|'retracted_or_corrected'|'unknown',
     *   'problem_reason' => string,
     *   'relevance_flag' => 'related'|'unsure'|'unsure_background'|'suspicious_unrelated',
     *   'relevance_score' => float,
     *   'reason' => string
     * ]
     */
    public function qcCitation(string $contextText, array $refMeta, array $options = []): array
    {
        $contextText = trim($contextText);
        $checkRetraction = isset($options['check_retraction']) ? (bool)$options['check_retraction'] : true;

        $refTitle = (string)($refMeta['title'] ?? '');
        $refAuthor = (string)($refMeta['author'] ?? '');
        $refJoura = (string)($refMeta['joura'] ?? '');
        $refDateno = (string)($refMeta['dateno'] ?? '');
        $referDoi = (string)($refMeta['refer_doi'] ?? '');
        $doilink = (string)($refMeta['doilink'] ?? '');

        $doi = $this->extractDoiFromMeta($referDoi, $doilink);

        // 1) Retraction/correction check (hard rule, drives problem_flag).
        $problemFlag = 'unknown';
        if (!$checkRetraction) {
            $problemReason = 'Skip retraction check';
        } elseif (empty($doi)) {
            $problemReason = 'DOI is empty';
        } else {
            $summary = $this->fetchWorkSummary($doi);
            if (!$summary || !isset($summary['is_retracted'])) {
                $problemReason = 'Crossref fetch failed or returned unexpected data';
            } elseif ((int)$summary['is_retracted'] === 1) {
                $problemFlag = 'retracted_or_corrected';
                $problemReason = !empty($summary['retract_reason'])
                    ? $summary['retract_reason']
                    : 'Crossref indicates retraction/correction';
            } else {
                $problemFlag = 'ok';
                $problemReason = 'Crossref indicates not retracted/corrected';
            }
        }

        // 2) Relevance check (soft rule plus evidence hits).
        $backgroundPhrases = isset($options['background_phrases']) ? (array)$options['background_phrases'] : [
            'several studies', 'many studies', 'the literature', 'the existing literature',
            'has been reported', 'have been reported', 'it has been shown', 'previous studies',
            'the study suggests', 'the literature suggests', 'in the literature',
        ];

        $ctxLower = strtolower($contextText);
        $isBackground = false;
        foreach ($backgroundPhrases as $ph) {
            $ph = strtolower(trim((string)$ph));
            if ($ph !== '' && $ph !== '0' && strpos($ctxLower, $ph) !== false) {
                $isBackground = true;
                break;
            }
        }

        $refTokens = $this->buildEvidenceTokens([
            'title' => $refTitle,
            'author' => $refAuthor,
            'journal' => $refJoura,
            'year' => $refDateno,
        ]);
        $ctxTokens = $this->tokenize($contextText);

        $titleTokens = $refTokens['titleTokens'] ?? [];
        $authorTokens = $refTokens['authorTokens'] ?? [];
        $journalTokens = $refTokens['journalTokens'] ?? [];
        $yearToken = $refTokens['yearToken'] ?? '';

        $titleOverlap = 0.0;
        $authorHit = 0.0;
        $journalOverlap = 0.0;
        $yearHit = 0.0;

        // Share of the reference's title tokens that appear in the context.
        if (!empty($titleTokens)) {
            $inter = array_intersect($titleTokens, $ctxTokens);
            $titleOverlap = count($inter) / max(1, count($titleTokens));
        }

        // A single author surname found in the context counts as a hit.
        if (!empty($authorTokens)) {
            foreach ($authorTokens as $at) {
                if ($at !== '' && in_array($at, $ctxTokens, true)) {
                    $authorHit = 1.0;
                    break;
                }
            }
        }

        if (!empty($journalTokens)) {
            $interJ = array_intersect($journalTokens, $ctxTokens);
            $journalOverlap = count($interJ) / max(1, count($journalTokens));
        }

        if (!empty($yearToken) && strpos($ctxLower, (string)$yearToken) !== false) {
            $yearHit = 1.0;
        }

        // Combined score (kept explainable: higher means more related).
        $score = round((
            0.60 * $titleOverlap +
            0.20 * $authorHit +
            0.15 * $yearHit +
            0.05 * $journalOverlap
        ), 4);

        $relevanceFlag = 'unsure';
        $reasonParts = [];

        if ($score >= 0.35 && ($authorHit > 0.0 || $yearHit > 0.0)) {
            $relevanceFlag = 'related';
            $reasonParts[] = 'title_keyword_overlap_high=' . $titleOverlap;
        } elseif ($score >= 0.25) {
            $relevanceFlag = 'unsure';
            $reasonParts[] = 'evidence_score_mid=' . $score;
        } elseif ($isBackground) {
            // Low evidence, but the wording suggests a generic background citation.
            $relevanceFlag = 'unsure_background';
            $reasonParts[] = 'background_phrases_detected';
        } else {
            $relevanceFlag = 'suspicious_unrelated';
            $reasonParts[] = 'evidence_score_low=' . $score;
        }

        $reasonParts[] = 'titleOverlap=' . $titleOverlap;
        $reasonParts[] = 'authorHit=' . $authorHit;
        $reasonParts[] = 'yearHit=' . $yearHit;
        $reasonParts[] = 'journalOverlap=' . $journalOverlap;

        return [
            'problem_flag' => $problemFlag,
            'problem_reason' => $problemReason,
            'relevance_flag' => $relevanceFlag,
            'relevance_score' => (float)$score,
            'reason' => implode('; ', $reasonParts),
        ];
    }

    /**
     * Extract a DOI string from refer_doi / doilink.
     *
     * refer_doi is tried first, then doilink. Each value may be a bare DOI,
     * a "doi:10.xxxx/..." string, or a URL containing "doi.org/10.xxxx/...".
     * Percent-encoding in a URL is decoded so the DOI is not encoded twice
     * when the request URL is built later.
     *
     * @param string $referDoi
     * @param string $doilink
     * @return string A valid DOI, or '' when none could be extracted
     */
    private function extractDoiFromMeta(string $referDoi, string $doilink): string
    {
        foreach ([$referDoi, $doilink] as $raw) {
            $candidate = trim($raw);
            if ($candidate === '') {
                continue;
            }

            // Already a bare DOI.
            $doi = $this->filterValidDoi($candidate);
            if ($doi !== '') {
                return $doi;
            }

            // Common form: https://doi.org/10.xxxx/xxxx or http://dx.doi.org/...
            // The delimiter is "~" because "#" is needed inside the character class;
            // with "#" as delimiter the pattern is cut short and fails to compile.
            if (preg_match('~doi\.org/([^?#]+)~i', $candidate, $m)) {
                $doi = $this->filterValidDoi(rawurldecode(trim((string)$m[1])));
                if ($doi !== '') {
                    return $doi;
                }
            }

            // "doi:10.xxxx/xxxx" prefix form.
            $stripped = (string)preg_replace('~^doi:\s*~i', '', $candidate);
            if ($stripped !== $candidate) {
                $doi = $this->filterValidDoi($stripped);
                if ($doi !== '') {
                    return $doi;
                }
            }
        }

        return '';
    }

    /**
     * Build the evidence tokens used for the rough evidence-hit / similarity check.
     *
     * @param array $src Keys: title, author, journal, year
     * @return array Keys: titleTokens, authorTokens, journalTokens (lists of strings), yearToken (string)
     */
    private function buildEvidenceTokens(array $src): array
    {
        $stop = [
            'the','a','an','and','or','of','in','on','for','with','to','from','by','at','as','is','are',
            'was','were','be','been','being','that','this','these','those','which','who','whom','it','its',
            'we','our','us','they','their','them','i','you','your','he','she','his','her',
            'study','studies','report','reported','reports','model','models','analysis','analyses','method','methods',
            'results','result','using','used','show','shown','demonstrated','demonstrate',
        ];

        // Title and journal words: drop stop words and anything shorter than 4 characters.
        $isInformative = function ($t) use ($stop) {
            return !in_array($t, $stop, true) && mb_strlen($t) >= 4;
        };

        $titleTokens = $this->tokenize((string)($src['title'] ?? ''));
        $titleTokens = array_values(array_filter(array_unique($titleTokens), $isInformative));

        // Author tokens: drop empties and stop words.
        $authorTokens = $this->extractAuthorTokens((string)($src['author'] ?? ''));
        $authorTokens = array_values(array_unique(array_filter(array_map(function ($t) use ($stop) {
            $t = trim($t);
            if ($t === '' || in_array($t, $stop, true)) {
                return '';
            }
            return $t;
        }, $authorTokens))));

        $journalTokens = $this->tokenize((string)($src['journal'] ?? ''));
        $journalTokens = array_values(array_filter(array_unique($journalTokens), $isInformative));

        // First 19xx/20xx found in the year source string.
        $yearToken = '';
        $yearRaw = (string)($src['year'] ?? '');
        if (preg_match('/(19\d{2}|20\d{2})/', $yearRaw, $m)) {
            $yearToken = (string)$m[1];
        }

        return [
            'titleTokens' => $titleTokens,
            'authorTokens' => $authorTokens,
            'journalTokens' => $journalTokens,
            'yearToken' => $yearToken,
        ];
    }

    /**
     * Extract lower-cased surname tokens from an author string (simplified).
     *
     * For each author the last word is taken as the surname ("John Smith" ->
     * "smith"). If that word is too short to be a surname, typically an
     * initial as in "Smith J", the first word is used instead. Tokens shorter
     * than 4 characters are discarded.
     *
     * @param string $authorStr Authors separated by ",", ";" or " and "
     * @return array List of unique tokens
     */
    private function extractAuthorTokens(string $authorStr): array
    {
        $authorStr = trim($authorStr);
        if ($authorStr === '') {
            return [];
        }

        // Split on the common separators.
        $parts = preg_split('/[,;]| and /i', $authorStr);
        if (!is_array($parts)) {
            return [];
        }

        $tokens = [];
        foreach ($parts as $p) {
            $p = trim((string)$p);
            if ($p === '') {
                continue;
            }

            $words = preg_split('/\s+/', $p);
            if (empty($words)) {
                continue;
            }

            // Prefer the last word; fall back to the first when the last is only an initial.
            $lastWord = (string)end($words);
            $firstWord = (string)reset($words);
            foreach ([$lastWord, $firstWord] as $word) {
                // Keep only letters, dots and hyphens (drops stray symbols).
                $cand = strtolower((string)preg_replace('/[^A-Za-z\.\-]/', '', $word));
                if (mb_strlen($cand) >= 4) {
                    $tokens[] = $cand;
                    break;
                }
            }
        }

        return array_values(array_unique($tokens));
    }

    /**
     * Lightweight tokenizer for English text: lower-cases, splits on anything
     * that is not a-z or 0-9, and drops tokens shorter than 3 characters.
     *
     * @param string $text
     * @return array List of unique tokens
     */
    private function tokenize(string $text): array
    {
        $text = strtolower(trim($text));
        if ($text === '') {
            return [];
        }

        $tokens = [];
        foreach (preg_split('/[^a-z0-9]+/i', $text) as $piece) {
            $piece = trim((string)$piece);
            // Very short fragments carry little information.
            if ($piece === '' || mb_strlen($piece) < 3) {
                continue;
            }
            $tokens[] = $piece;
        }

        return array_values(array_unique($tokens));
    }

    /**
     * Lightweight English sentence splitter: splits after . ? ! when followed
     * by whitespace, so an in-sentence [n] stays with its sentence.
     *
     * @param string $text
     * @return array List of non-empty sentences
     */
    private function splitEnglishSentences(string $text): array
    {
        $text = trim($text);
        if ($text === '') {
            return [];
        }

        // Collapse line breaks and runs of whitespace into single spaces so
        // they do not interfere with the split.
        $normalized = preg_replace('/\s+/u', ' ', $text);
        if ($normalized === null) {
            // Invalid UTF-8 makes the /u variant fail; fall back to byte mode.
            $normalized = (string)preg_replace('/\s+/', ' ', $text);
        }

        // Split at sentence-ending punctuation followed by whitespace.
        $parts = preg_split('/(?<=[\.\?\!])\s+/', $normalized);
        if (!is_array($parts)) {
            return [];
        }

        $sentences = [];
        foreach ($parts as $p) {
            $p = trim((string)$p);
            if ($p !== '') {
                $sentences[] = $p;
            }
        }

        return $sentences;
    }
}