Files
tougao/application/common/CitationRelevanceService.php
2026-04-03 11:45:45 +08:00

332 lines
13 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
namespace app\common;
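/**
 * Citation relevance checking service (PubMed + embedding).
 *
 * Dependencies:
 * - PubmedService: fetches title/abstract/mesh/publication_types by DOI
 * - CrossrefService: retraction/correction detection (supplementary)
 *
 * Embedding:
 * - Uses the in-house LLM embedding endpoint (no per-token cost, but slow)
 * - embedding_url / embedding_headers / timeout are passed in via the constructor config
 * - Built-in file cache reduces the cost of repeated embedding requests
 */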
class CitationRelevanceService
{
// PubMed client; checkOne() calls fetchByDoi() on it to load title/abstract/MeSH/publication types.
private $pubmed;
// Crossref client; checkOne() calls qcCitation() on it for retraction checks and as the fallback path.
private $crossref;
// URL of the dedicated embedding endpoint. When empty, embed() falls back to the chat endpoint.
private $embeddingUrl = '';
// Extra HTTP header lines appended after "Content-Type: application/json" on embedding AND chat requests.
private $embeddingHeaders = [];
// cURL total timeout in seconds (CURLOPT_TIMEOUT); the constructor enforces a minimum of 10.
private $timeout = 120;
// URL of the chat-completions endpoint used only when no embedding URL is configured.
private $chatUrl = '';
// Model name sent in the chat request; the chat fallback is disabled while this or $chatUrl is empty.
private $chatModel = '';
// Exact vector length demanded from (and verified against) the chat fallback; constructor minimum is 32.
private $embeddingDim = 256;
// max_tokens sent with the chat request; constructor minimum is 256.
// NOTE(review): 256 JSON floats may not fit in 1200 completion tokens - confirm these defaults work together.
private $chatMaxTokens = 1200;
/**
 * Build the service, optionally injecting collaborators and overriding defaults.
 *
 * The collaborator parameters are declared explicitly nullable (?Type) because
 * implicit nullability via a "= null" default is deprecated as of PHP 8.4.
 *
 * @param PubmedService|null   $pubmed   PubMed client; a new PubmedService is created when null.
 * @param CrossrefService|null $crossref Crossref client; a new CrossrefService is created when null.
 * @param array                $config   Optional overrides:
 *   - embedding_url (string)    dedicated embedding endpoint
 *   - embedding_headers (array) extra HTTP header lines (ignored unless it is an array)
 *   - timeout (int)             request timeout in seconds, clamped to >= 10
 *   - chat_url (string)         chat-completions endpoint for the fallback
 *   - chat_model (string)       model name for the fallback
 *   - embedding_dim (int)       vector length for the fallback, clamped to >= 32
 *   - chat_max_tokens (int)     max_tokens for the fallback, clamped to >= 256
 */
public function __construct(?PubmedService $pubmed = null, ?CrossrefService $crossref = null, array $config = [])
{
    $this->pubmed = $pubmed ?? new PubmedService();
    $this->crossref = $crossref ?? new CrossrefService();

    if (isset($config['embedding_url'])) {
        $this->embeddingUrl = (string)$config['embedding_url'];
    }
    if (isset($config['embedding_headers']) && is_array($config['embedding_headers'])) {
        $this->embeddingHeaders = $config['embedding_headers'];
    }
    if (isset($config['timeout'])) {
        $this->timeout = max(10, intval($config['timeout']));
    }
    if (isset($config['chat_url'])) {
        $this->chatUrl = (string)$config['chat_url'];
    }
    if (isset($config['chat_model'])) {
        $this->chatModel = (string)$config['chat_model'];
    }
    if (isset($config['embedding_dim'])) {
        $this->embeddingDim = max(32, intval($config['embedding_dim']));
    }
    if (isset($config['chat_max_tokens'])) {
        $this->chatMaxTokens = max(256, intval($config['chat_max_tokens']));
    }
}
/**
 * Check how relevant a single cited reference is to the text that cites it.
 *
 * Flow:
 *  1. Optionally ask Crossref whether the reference is retracted/corrected.
 *  2. Fetch PubMed metadata by DOI and build "title + abstract + MeSH" text.
 *  3. If PubMed text exists, compare embeddings of the context and that text
 *     and map the cosine similarity onto a relevance flag.
 *  4. If no PubMed text exists, fall back to CrossrefService::qcCitation().
 *
 * When PubMed text exists but an embedding cannot be obtained, the result is
 * reported as relevance_flag "unsure" with reason "embedding_unavailable"
 * instead of being scored 0.0 and labelled "suspicious_unrelated": an
 * unavailable embedding backend says nothing about the citation itself.
 *
 * @param string $contextText Text surrounding the citation (English).
 * @param array  $referRow    production_article_refer row (at least
 *                            refer_doi/doilink/title/author/joura/dateno).
 * @param array  $options
 *   - sentence_is_background (bool) caller already judged the sentence to be background
 *     citation stacking; a low score then yields "unsure_background" instead of
 *     "suspicious_unrelated"
 *   - sim_related (float)      threshold for "related", default 0.75
 *   - sim_unsure (float)       threshold for "unsure", default 0.60
 *   - check_retraction (bool)  whether to check retraction/correction, default true
 * @return array On the embedding path: problem_flag, problem_reason, relevance_flag,
 *               relevance_score, reason, pubmed{pmid,year,journal,publication_types}.
 *               On the fallback path: whatever qcCitation() returns, with problem_flag,
 *               problem_reason and reason overwritten.
 */
public function checkOne(string $contextText, array $referRow, array $options = []): array
{
    $contextText = trim($contextText);
    $simRelated = isset($options['sim_related']) ? (float)$options['sim_related'] : 0.75;
    $simUnsure = isset($options['sim_unsure']) ? (float)$options['sim_unsure'] : 0.60;
    $checkRetraction = isset($options['check_retraction']) ? (bool)$options['check_retraction'] : true;
    $isBackground = !empty($options['sentence_is_background']);

    // 1) Problem entries (retraction/correction): ask Crossref first.
    //    (Per the original note, Crossref can only decide when a DOI is available.)
    $problemFlag = 'unknown';
    $problemReason = '';
    if ($checkRetraction) {
        $qc = $this->crossref->qcCitation($contextText, $referRow, ['check_retraction' => true]);
        $problemFlag = $qc['problem_flag'] ?? 'unknown';
        $problemReason = $qc['problem_reason'] ?? '';
    }

    // 2) Fetch abstract/MeSH from PubMed to enrich the semantic comparison.
    $doi = $this->extractDoiFromRefer($referRow);
    $pub = $doi ? $this->pubmed->fetchByDoi($doi) : null;
    $pubText = '';
    $pubTypes = [];
    if ($pub) {
        $pubTypes = $pub['publication_types'] ?? [];
        $mesh = $pub['mesh_terms'] ?? [];
        $pubText = trim(
            ($pub['title'] ?? '') . "\n" .
            ($pub['abstract'] ?? '') . "\n" .
            (!empty($mesh) ? ('MeSH: ' . implode('; ', $mesh)) : '')
        );
    }

    // Degraded path: without PubMed text, use the Crossref evidence method (more conservative).
    if ($pubText === '') {
        $fallback = $this->crossref->qcCitation($contextText, $referRow, ['check_retraction' => false]);
        $fallback['problem_flag'] = $problemFlag;
        $fallback['problem_reason'] = $problemReason;
        $fallback['reason'] = 'fallback_crossref_evidence; ' . ($fallback['reason'] ?? '');
        return $fallback;
    }

    // 3) Embedding similarity: context vs. PubMed text.
    //    Both embeddings are always requested so that a successful one is still cached.
    $contextVec = $this->embedCached($contextText);
    $pubVec = $this->embedCached($pubText);

    if ($contextVec === null || $pubVec === null) {
        // No usable vectors: do not pretend a similarity of 0 was measured.
        $sim = 0.0;
        $relevanceFlag = 'unsure';
        $reason = 'embedding_unavailable';
    } else {
        $sim = $this->cosine($contextVec, $pubVec);
        $reason = 'embedding(context,pubmed_text)';
        if ($sim >= $simRelated) {
            $relevanceFlag = 'related';
        } elseif ($sim >= $simUnsure) {
            $relevanceFlag = 'unsure';
        } else {
            $relevanceFlag = $isBackground ? 'unsure_background' : 'suspicious_unrelated';
        }
    }

    // PubMed publication types can also reveal a retraction/correction (supplementary signal).
    if ($checkRetraction && $problemFlag !== 'retracted_or_corrected' && !empty($pubTypes)) {
        $ptLower = strtolower(implode(' | ', $pubTypes));
        if (strpos($ptLower, 'retracted publication') !== false
            || strpos($ptLower, 'retraction of publication') !== false
            || strpos($ptLower, 'published erratum') !== false
        ) {
            $problemFlag = 'retracted_or_corrected';
            $problemReason = 'PubMed publication type indicates retraction/correction';
        }
    }

    return [
        'problem_flag' => $problemFlag,
        'problem_reason' => $problemReason,
        'relevance_flag' => $relevanceFlag,
        'relevance_score' => round($sim, 4),
        'reason' => $reason,
        'pubmed' => [
            'pmid' => $pub['pmid'] ?? '',
            'year' => $pub['year'] ?? '',
            'journal' => $pub['journal'] ?? '',
            'publication_types' => $pubTypes,
        ],
    ];
}
// ---------------- embedding ----------------
/**
 * Return the embedding of $text, served from the file cache when possible.
 *
 * The cache key covers both the text and the embedding source (dedicated
 * endpoint URL, or chat URL + model + dimension). Vectors produced by
 * different backends are not comparable, so they must not share cache
 * entries; keying on the text alone would let a configuration change keep
 * serving stale vectors for the whole 90-day TTL.
 * Note: entries written under the previous text-only key are no longer hit.
 *
 * @param string $text Text to embed; surrounding whitespace is ignored.
 * @return array|null Non-empty vector, or null for blank text or when no embedding could be obtained.
 */
private function embedCached(string $text): ?array
{
    $text = trim($text);
    if ($text === '') {
        return null;
    }

    // Mirrors the selection order in embed(): dedicated endpoint first, chat fallback otherwise.
    $source = $this->embeddingUrl !== ''
        ? 'embedding|' . $this->embeddingUrl
        : 'chat|' . $this->chatUrl . '|' . $this->chatModel . '|' . $this->embeddingDim;
    $key = 'emb_' . sha1($source . "\n" . $text);

    $cached = $this->cacheGet($key, 90 * 86400);
    if (is_array($cached) && !empty($cached)) {
        return $cached;
    }

    $vec = $this->embed($text);
    if (is_array($vec) && !empty($vec)) {
        $this->cacheSet($key, $vec);
        return $vec;
    }
    return null;
}
/**
 * Obtain an embedding vector for $text and return it L2-normalised.
 *
 * 1. If an embedding endpoint is configured, POST {"text": ...} to it.
 *    Accepted response shapes:
 *    - OpenAI embeddings: {data:[{embedding:[...]}]}
 *    - {embedding:[...]}
 *    - a bare JSON array of numbers [...]
 *    Any other response yields null; the chat fallback is NOT tried in that case.
 * 2. Otherwise, if both chat URL and chat model are configured, ask the chat
 *    model to print a JSON object {"embedding":[...]} with exactly
 *    $embeddingDim entries, optionally wrapped in a Markdown code fence.
 *
 * @param string $text Text to embed.
 * @return array|null Normalised vector, or null when nothing is configured,
 *                    the request fails, or the response cannot be interpreted.
 */
private function embed(string $text): ?array
{
    // 1) Prefer the dedicated embeddings endpoint.
    if ($this->embeddingUrl !== '') {
        $res = $this->postJson($this->embeddingUrl, ['text' => $text]);
        if ($res === null) {
            return null;
        }
        $decoded = json_decode($res, true);
        if (!is_array($decoded)) {
            return null;
        }
        if (isset($decoded['data'][0]['embedding']) && is_array($decoded['data'][0]['embedding'])) {
            return $this->normalizeVector($decoded['data'][0]['embedding']);
        }
        if (isset($decoded['embedding']) && is_array($decoded['embedding'])) {
            return $this->normalizeVector($decoded['embedding']);
        }
        // Bare array: only the first element is inspected to decide whether it is a vector.
        if (isset($decoded[0]) && (is_float($decoded[0]) || is_int($decoded[0]))) {
            return $this->normalizeVector($decoded);
        }
        return null;
    }

    // 2) No embeddings endpoint: use chat/completions to produce a fixed-dimension vector.
    if ($this->chatUrl === '' || $this->chatModel === '') {
        return null;
    }
    $sys = "You are an embedding generator. Output ONLY valid JSON in this exact shape: {\"embedding\":[...]}.\n"
        . "Rules:\n"
        . "- embedding must be an array of exactly {$this->embeddingDim} floats\n"
        . "- each float must be between -1 and 1\n"
        . "- do not include any other keys or any extra text\n";
    $res = $this->postJson($this->chatUrl, [
        'model' => $this->chatModel,
        'temperature' => 0,
        'max_tokens' => $this->chatMaxTokens,
        'messages' => [
            ['role' => 'system', 'content' => $sys],
            ['role' => 'user', 'content' => $text],
        ],
    ]);
    if ($res === null) {
        return null;
    }

    $decoded = json_decode($res, true);
    $content = '';
    if (is_array($decoded) && isset($decoded['choices'][0]['message']['content'])) {
        $content = (string)$decoded['choices'][0]['message']['content'];
    }
    $content = trim($content);
    if ($content === '') {
        return null;
    }
    // The content may be wrapped in a ```json ... ``` fence; keep only the inner part.
    if (preg_match('/```(?:json)?\\s*([\\s\\S]*?)\\s*```/i', $content, $m)) {
        $content = trim($m[1]);
    }
    $j = json_decode($content, true);
    if (!is_array($j) || !isset($j['embedding']) || !is_array($j['embedding'])) {
        return null;
    }
    $vec = $j['embedding'];
    // Reject truncated or padded output so every chat-generated vector has the same length.
    if (count($vec) !== $this->embeddingDim) {
        return null;
    }
    return $this->normalizeVector($vec);
}

/**
 * POST $body as JSON to $url and return the raw response body.
 *
 * Sends "Content-Type: application/json" followed by the configured extra headers.
 *
 * @param string $url  Target endpoint.
 * @param array  $body Data to JSON-encode as the request body.
 * @return string|null Non-blank response body, or null when encoding, the
 *                     cURL handle or the transfer fails or the body is blank.
 */
private function postJson(string $url, array $body): ?string
{
    $flags = JSON_UNESCAPED_UNICODE;
    // Replace invalid UTF-8 bytes instead of failing the whole encode (constant exists since PHP 7.2).
    if (defined('JSON_INVALID_UTF8_SUBSTITUTE')) {
        $flags |= JSON_INVALID_UTF8_SUBSTITUTE;
    }
    $payload = json_encode($body, $flags);
    if ($payload === false) {
        // Never hand a boolean to CURLOPT_POSTFIELDS.
        return null;
    }

    $ch = curl_init();
    if ($ch === false) {
        return null;
    }
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
    // NOTE(review): TLS peer verification is disabled, as in the original code.
    // Presumably the internal endpoint uses a private certificate - confirm, and prefer CURLOPT_CAINFO.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
    $headers = array_merge(['Content-Type: application/json'], $this->embeddingHeaders);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    $res = curl_exec($ch);
    curl_close($ch);

    if (!is_string($res) || trim($res) === '') {
        return null;
    }
    return $res;
}
/**
 * Cosine similarity of two numeric vectors.
 *
 * Only the first min(count($a), count($b)) positions are compared, read by
 * integer index 0..n-1. Returns 0.0 when there is nothing to compare or when
 * either compared prefix has zero magnitude.
 *
 * @param array $a First vector (0-indexed list of numbers).
 * @param array $b Second vector (0-indexed list of numbers).
 * @return float Similarity value, or 0.0 in the degenerate cases above.
 */
private function cosine(array $a, array $b): float
{
    $length = min(count($a), count($b));
    if ($length <= 0) {
        return 0.0;
    }

    $dotProduct = 0.0;
    $squaresA = 0.0;
    $squaresB = 0.0;
    foreach (range(0, $length - 1) as $idx) {
        $left = (float)$a[$idx];
        $right = (float)$b[$idx];
        $dotProduct += $left * $right;
        $squaresA += $left * $left;
        $squaresB += $right * $right;
    }

    // A zero-magnitude vector has no direction; avoid dividing by zero.
    if ($squaresA <= 0.0 || $squaresB <= 0.0) {
        return 0.0;
    }
    return $dotProduct / (sqrt($squaresA) * sqrt($squaresB));
}
/**
 * Cast every element to float and scale the vector to unit (L2) length.
 *
 * The result is always a 0-indexed list, whatever keys the input had.
 * A vector whose sum of squares is not positive is returned as cast floats,
 * unscaled.
 *
 * @param array $v Raw vector values.
 * @return array List of floats, unit length when the input had positive magnitude.
 */
private function normalizeVector(array $v): array
{
    $floats = array_map(
        static function ($component) {
            return (float)$component;
        },
        array_values($v)
    );

    $sumOfSquares = 0.0;
    foreach ($floats as $value) {
        $sumOfSquares += $value * $value;
    }
    if ($sumOfSquares <= 0.0) {
        return $floats;
    }

    $length = sqrt($sumOfSquares);
    return array_map(
        static function ($value) use ($length) {
            return $value / $length;
        },
        $floats
    );
}
/**
 * Pull a DOI out of a reference row.
 *
 * Preference order:
 *  1. refer_doi, returned trimmed and otherwise unchanged;
 *  2. doilink: when it contains "doi.org/", everything after that up to the
 *     first "?" or "#" is returned; otherwise the trimmed link itself.
 *
 * The pattern uses "~" as delimiter on purpose: with "#" as delimiter, the
 * literal "#" inside the character class ends the pattern early, PHP reports
 * "Unknown modifier", and the match can never succeed.
 *
 * (A simple local implementation; CrossrefService's own extraction logic is
 * not accessible from here, according to the original note.)
 *
 * @param array $referRow Reference row; reads the refer_doi and doilink keys.
 * @return string DOI (or raw link when no DOI could be isolated), '' when both fields are empty.
 */
private function extractDoiFromRefer(array $referRow): string
{
    $doi = trim((string)($referRow['refer_doi'] ?? ''));
    if ($doi !== '') {
        return $doi;
    }

    $doilink = trim((string)($referRow['doilink'] ?? ''));
    if ($doilink === '') {
        return '';
    }

    if (preg_match('~doi\.org/([^?#]+)~i', $doilink, $m) === 1) {
        return trim((string)$m[1]);
    }
    return $doilink;
}
// ---------------- cache ----------------
/**
 * Directory that holds the embedding cache files.
 *
 * @return string ROOT_PATH without trailing "/" characters, followed by "/runtime/embed_cache".
 */
private function cacheDir(): string
{
    $root = rtrim(ROOT_PATH, '/');

    return $root . '/runtime/embed_cache';
}
/**
 * Read a cached value from "<cacheDir>/<key>.json".
 *
 * @param string $key        Cache key, used verbatim as the file name stem.
 * @param int    $ttlSeconds Maximum accepted age of the file, judged by its modification time.
 * @return mixed Decoded JSON (objects as associative arrays), or null when the file
 *               is missing, older than the TTL, unreadable or not valid JSON.
 */
private function cacheGet(string $key, int $ttlSeconds)
{
    $path = $this->cacheDir() . '/' . $key . '.json';

    if (is_file($path)) {
        $modifiedAt = filemtime($path);
        $isFresh = $modifiedAt && (time() - $modifiedAt) <= $ttlSeconds;
        if ($isFresh) {
            // A failed read yields false, which becomes '' and decodes to null.
            $contents = @file_get_contents($path);
            return json_decode((string)$contents, true);
        }
    }

    return null;
}
/**
 * Store a value as JSON in "<cacheDir>/<key>.json".
 *
 * Best effort: the cache directory is created on demand, and failures of
 * mkdir or of the write are deliberately suppressed and not reported.
 *
 * @param string $key   Cache key, used verbatim as the file name stem.
 * @param mixed  $value Value to JSON-encode.
 */
private function cacheSet(string $key, $value): void
{
    $directory = $this->cacheDir();
    if (!is_dir($directory)) {
        @mkdir($directory, 0777, true);
    }

    $target = $directory . '/' . $key . '.json';
    $encoded = json_encode($value, JSON_UNESCAPED_UNICODE);
    @file_put_contents($target, $encoded);
}
}