<?php

namespace app\common;

/**
 * Citation relevance checking service (PubMed + embedding).
 *
 * Dependencies:
 * - PubmedService: fetches title / abstract / mesh / publication_types by DOI
 * - CrossrefService: retraction / correction detection (supplementary)
 *
 * Embedding:
 * - Uses the in-house LLM embedding endpoint (no per-token cost, but slow)
 * - embedding_url / headers / timeout are supplied through the constructor config
 * - A built-in file cache reduces the cost of repeated embeddings
 */
class CitationRelevanceService
{
    /** @var PubmedService */
    private $pubmed;

    /** @var CrossrefService */
    private $crossref;

    /** @var string Dedicated embeddings endpoint; empty string disables it. */
    private $embeddingUrl = '';

    /** @var string[] Extra HTTP header lines sent with every embedding/chat request. */
    private $embeddingHeaders = [];

    /** @var int cURL timeout in seconds (clamped to >= 10 by the constructor). */
    private $timeout = 120;

    /** @var string chat/completions endpoint used when no embeddings endpoint is set. */
    private $chatUrl = '';

    /** @var string Model name sent to the chat endpoint. */
    private $chatModel = '';

    /** @var int Vector length requested from (and required of) the chat fallback. */
    private $embeddingDim = 256;

    /** @var int max_tokens sent to the chat endpoint. */
    private $chatMaxTokens = 1200;

    /**
     * @var bool Whether cURL verifies the peer certificate.
     *           SECURITY: defaults to false to preserve the historical behaviour of this
     *           class; set config['verify_ssl'] = true whenever the endpoint has a
     *           certificate that can be validated.
     */
    private $verifySsl = false;

    /**
     * @param PubmedService|null   $pubmed   Defaults to a new PubmedService.
     * @param CrossrefService|null $crossref Defaults to a new CrossrefService.
     * @param array                $config   Optional keys:
     *   - embedding_url (string)
     *   - embedding_headers (string[])
     *   - timeout (int, seconds, minimum 10)
     *   - chat_url (string)
     *   - chat_model (string)
     *   - embedding_dim (int, minimum 32)
     *   - chat_max_tokens (int, minimum 256)
     *   - verify_ssl (bool, default false)
     */
    public function __construct(?PubmedService $pubmed = null, ?CrossrefService $crossref = null, array $config = [])
    {
        $this->pubmed = $pubmed ?? new PubmedService();
        $this->crossref = $crossref ?? new CrossrefService();

        if (isset($config['embedding_url'])) {
            $this->embeddingUrl = (string)$config['embedding_url'];
        }
        if (isset($config['embedding_headers']) && is_array($config['embedding_headers'])) {
            $this->embeddingHeaders = $config['embedding_headers'];
        }
        if (isset($config['timeout'])) {
            $this->timeout = max(10, intval($config['timeout']));
        }
        if (isset($config['chat_url'])) {
            $this->chatUrl = (string)$config['chat_url'];
        }
        if (isset($config['chat_model'])) {
            $this->chatModel = (string)$config['chat_model'];
        }
        if (isset($config['embedding_dim'])) {
            $this->embeddingDim = max(32, intval($config['embedding_dim']));
        }
        if (isset($config['chat_max_tokens'])) {
            $this->chatMaxTokens = max(256, intval($config['chat_max_tokens']));
        }
        if (isset($config['verify_ssl'])) {
            $this->verifySsl = (bool)$config['verify_ssl'];
        }
    }

    /**
     * Check the relevance of a single citation.
     *
     * @param string $contextText Text surrounding the citation (English).
     * @param array  $referRow    production_article_refer row (at least
     *                            refer_doi/doilink/title/author/joura/dateno).
     * @param array  $options
     *   - sentence_is_background (bool) caller already classified the sentence as a
     *     background "citation pile"; low similarity is then downgraded instead of flagged
     *   - sim_related (float) threshold for "related", default 0.75
     *   - sim_unsure (float) threshold for "unsure", default 0.60
     *   - check_retraction (bool) check retraction/correction, default true
     *
     * @return array When PubMed text is available: problem_flag, problem_reason,
     *               relevance_flag, relevance_score, reason, pubmed{pmid,year,journal,
     *               publication_types}. Otherwise the CrossrefService::qcCitation result
     *               with problem_flag / problem_reason / reason overridden.
     */
    public function checkOne(string $contextText, array $referRow, array $options = []): array
    {
        $contextText = trim($contextText);
        $simRelated = isset($options['sim_related']) ? (float)$options['sim_related'] : 0.75;
        $simUnsure = isset($options['sim_unsure']) ? (float)$options['sim_unsure'] : 0.60;
        $checkRetraction = isset($options['check_retraction']) ? (bool)$options['check_retraction'] : true;
        $isBackground = !empty($options['sentence_is_background']);

        // 1) Problem entries (retracted / corrected): ask Crossref first.
        $problemFlag = 'unknown';
        $problemReason = '';
        if ($checkRetraction) {
            $qc = $this->crossref->qcCitation($contextText, $referRow, ['check_retraction' => true]);
            $problemFlag = $qc['problem_flag'] ?? 'unknown';
            $problemReason = $qc['problem_reason'] ?? '';
        }

        // 2) Fetch abstract / MeSH from PubMed to enrich the text that gets embedded.
        $doi = $this->extractDoiFromRefer($referRow);
        $pub = $doi !== '' ? $this->pubmed->fetchByDoi($doi) : null;

        $pubText = '';
        $pubTypes = [];
        if ($pub) {
            $pubTypes = $pub['publication_types'] ?? [];
            $mesh = $pub['mesh_terms'] ?? [];
            $meshLine = !empty($mesh) ? ('MeSH: ' . implode('; ', $mesh)) : '';
            $pubText = trim(($pub['title'] ?? '') . "\n" . ($pub['abstract'] ?? '') . "\n" . $meshLine);
        }

        // PubMed publication types can also reveal a retraction/correction. This runs
        // before the branch below so the Crossref fallback path benefits from it too.
        if ($checkRetraction
            && $problemFlag !== 'retracted_or_corrected'
            && $this->pubTypesIndicateProblem((array)$pubTypes)
        ) {
            $problemFlag = 'retracted_or_corrected';
            $problemReason = 'PubMed publication type indicates retraction/correction';
        }

        // 3) Embedding similarity (context vs. PubMed text).
        if ($pubText !== '') {
            $contextVec = $this->embedCached($contextText);
            $pubVec = $this->embedCached($pubText);
            $embedded = ($contextVec !== null && $pubVec !== null);
            $sim = $embedded ? $this->cosine($contextVec, $pubVec) : 0.0;

            if (!$embedded) {
                // An embedding failure says nothing about relevance; never report it
                // as "suspicious_unrelated".
                $relevanceFlag = 'unsure';
            } elseif ($sim >= $simRelated) {
                $relevanceFlag = 'related';
            } elseif ($sim >= $simUnsure) {
                $relevanceFlag = 'unsure';
            } else {
                $relevanceFlag = $isBackground ? 'unsure_background' : 'suspicious_unrelated';
            }

            return [
                'problem_flag' => $problemFlag,
                'problem_reason' => $problemReason,
                'relevance_flag' => $relevanceFlag,
                'relevance_score' => round($sim, 4),
                'reason' => $embedded ? 'embedding(context,pubmed_text)' : 'embedding_unavailable',
                'pubmed' => [
                    'pmid' => $pub['pmid'] ?? '',
                    'year' => $pub['year'] ?? '',
                    'journal' => $pub['journal'] ?? '',
                    'publication_types' => $pubTypes,
                ],
            ];
        }

        // Fallback: no PubMed text, so use CrossrefService's evidence method
        // (tends to be conservative).
        $fallback = $this->crossref->qcCitation($contextText, $referRow, ['check_retraction' => false]);
        $fallback['problem_flag'] = $problemFlag;
        $fallback['problem_reason'] = $problemReason;
        $fallback['reason'] = 'fallback_crossref_evidence; ' . ($fallback['reason'] ?? '');
        return $fallback;
    }

    /**
     * Tell whether PubMed publication types mark the article as retracted or corrected.
     *
     * @param array $pubTypes Publication type labels as returned by PubmedService.
     */
    private function pubTypesIndicateProblem(array $pubTypes): bool
    {
        if (empty($pubTypes)) {
            return false;
        }
        $haystack = strtolower(implode(' | ', $pubTypes));
        foreach (['retracted publication', 'retraction of publication', 'published erratum'] as $marker) {
            if (strpos($haystack, $marker) !== false) {
                return true;
            }
        }
        return false;
    }

    // ---------------- embedding ----------------

    /**
     * Return the embedding of $text, served from the file cache when possible.
     *
     * NOTE(review): the cache key is derived from the text alone, so vectors cached
     * under one backend / dimension are reused after the embedding config changes.
     *
     * @return float[]|null Null for empty text or when the embedding call fails.
     */
    private function embedCached(string $text): ?array
    {
        $text = trim($text);
        if ($text === '') {
            return null;
        }

        $key = 'emb_' . sha1($text);
        $cached = $this->cacheGet($key, 90 * 86400);
        if (is_array($cached) && !empty($cached)) {
            return $cached;
        }

        $vec = $this->embed($text);
        if (!is_array($vec) || empty($vec)) {
            return null;
        }
        $this->cacheSet($key, $vec);
        return $vec;
    }

    /**
     * Call the internal embedding endpoint.
     *
     * Accepted response formats of the embeddings endpoint:
     * - OpenAI embeddings: {data:[{embedding:[...]}]}
     * - {embedding:[...]}
     * - a bare array [...]
     *
     * When no embeddings endpoint is configured, the chat/completions endpoint is
     * asked to emit a fixed-dimension vector as JSON instead.
     *
     * @return float[]|null L2-normalised vector, or null on any failure.
     */
    private function embed(string $text): ?array
    {
        // 1) Prefer the dedicated embeddings endpoint.
        if ($this->embeddingUrl !== '') {
            $res = $this->postJson($this->embeddingUrl, ['text' => $text]);
            if ($res === null) {
                return null;
            }
            $decoded = json_decode($res, true);
            if (!is_array($decoded)) {
                return null;
            }
            if (isset($decoded['data'][0]['embedding']) && is_array($decoded['data'][0]['embedding'])) {
                return $this->normalizeVector($decoded['data'][0]['embedding']);
            }
            if (isset($decoded['embedding']) && is_array($decoded['embedding'])) {
                return $this->normalizeVector($decoded['embedding']);
            }
            if (isset($decoded[0]) && (is_float($decoded[0]) || is_int($decoded[0]))) {
                return $this->normalizeVector($decoded);
            }
            return null;
        }

        // 2) No embeddings endpoint: have chat/completions produce a fixed-size vector.
        if ($this->chatUrl === '' || $this->chatModel === '') {
            return null;
        }

        $sys = "You are an embedding generator. Output ONLY valid JSON in this exact shape: {\"embedding\":[...]}.\n"
            . "Rules:\n"
            . "- embedding must be an array of exactly {$this->embeddingDim} floats\n"
            . "- each float must be between -1 and 1\n"
            . "- do not include any other keys or any extra text\n";

        $res = $this->postJson($this->chatUrl, [
            'model' => $this->chatModel,
            'temperature' => 0,
            'max_tokens' => $this->chatMaxTokens,
            'messages' => [
                ['role' => 'system', 'content' => $sys],
                ['role' => 'user', 'content' => $text],
            ],
        ]);
        if ($res === null) {
            return null;
        }

        $decoded = json_decode($res, true);
        $content = '';
        if (is_array($decoded) && isset($decoded['choices'][0]['message']['content'])) {
            $content = (string)$decoded['choices'][0]['message']['content'];
        }
        $content = trim($content);
        if ($content === '') {
            return null;
        }

        // The content may be wrapped in a ```json ... ``` fence.
        if (preg_match('/```(?:json)?\\s*([\\s\\S]*?)\\s*```/i', $content, $m)) {
            $content = trim($m[1]);
        }
        $parsed = json_decode($content, true);
        if (!is_array($parsed) || !isset($parsed['embedding']) || !is_array($parsed['embedding'])) {
            return null;
        }

        $vec = $parsed['embedding'];
        // Reject vectors of the wrong length rather than comparing mismatched dimensions.
        if (count($vec) !== $this->embeddingDim) {
            return null;
        }
        return $this->normalizeVector($vec);
    }

    /**
     * POST a JSON body and return the raw response body.
     *
     * Sends "Content-Type: application/json" plus the configured embedding headers.
     *
     * @param string $url  Endpoint to call.
     * @param array  $body Data to JSON-encode as the request body.
     *
     * @return string|null Response body, or null when encoding fails, the transfer
     *                     fails, or the body is empty/whitespace.
     */
    private function postJson(string $url, array $body): ?string
    {
        $payload = json_encode($body, JSON_UNESCAPED_UNICODE);
        if ($payload === false) {
            // e.g. malformed UTF-8 in the text; do not send a bogus request body.
            return null;
        }

        $ch = curl_init();
        if ($ch === false) {
            return null;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
        // SECURITY: peer verification is off unless config['verify_ssl'] is true.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, $this->verifySsl);
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
        curl_setopt(
            $ch,
            CURLOPT_HTTPHEADER,
            array_merge(['Content-Type: application/json'], $this->embeddingHeaders)
        );

        $res = curl_exec($ch);
        curl_close($ch);

        if (!is_string($res) || trim($res) === '') {
            return null;
        }
        return $res;
    }

    /**
     * Cosine similarity of two vectors.
     *
     * Only the first min(count($a), count($b)) components are compared.
     *
     * @return float 0.0 when either vector is empty or has zero magnitude.
     */
    private function cosine(array $a, array $b): float
    {
        $n = min(count($a), count($b));
        if ($n <= 0) {
            return 0.0;
        }

        $dot = 0.0;
        $normA = 0.0;
        $normB = 0.0;
        for ($i = 0; $i < $n; $i++) {
            $x = (float)$a[$i];
            $y = (float)$b[$i];
            $dot += $x * $y;
            $normA += $x * $x;
            $normB += $y * $y;
        }
        if ($normA <= 0.0 || $normB <= 0.0) {
            return 0.0;
        }
        return $dot / (sqrt($normA) * sqrt($normB));
    }

    /**
     * Cast every component to float and scale the vector to unit L2 length.
     *
     * @return float[] Re-indexed list; an all-zero vector is returned unscaled.
     */
    private function normalizeVector(array $v): array
    {
        $floats = [];
        $sumSquares = 0.0;
        foreach ($v as $component) {
            $f = (float)$component;
            $floats[] = $f;
            $sumSquares += $f * $f;
        }
        if ($sumSquares <= 0.0) {
            return $floats;
        }

        $norm = sqrt($sumSquares);
        foreach ($floats as $i => $f) {
            $floats[$i] = $f / $norm;
        }
        return $floats;
    }

    /**
     * Extract a DOI from a reference row.
     *
     * refer_doi takes precedence over doilink. For either field, a value containing
     * "doi.org/<doi>" is reduced to the DOI (query string and fragment dropped);
     * any other non-empty value is returned trimmed but otherwise unchanged.
     *
     * @return string The DOI, or '' when neither field is set.
     */
    private function extractDoiFromRefer(array $referRow): string
    {
        foreach (['refer_doi', 'doilink'] as $field) {
            $raw = trim((string)($referRow[$field] ?? ''));
            if ($raw === '') {
                continue;
            }
            // "~" is the delimiter because the character class contains "#": PHP ends the
            // pattern at the first unescaped delimiter, even inside a character class.
            if (preg_match('~doi\.org/([^?#]+)~i', $raw, $m)) {
                return trim((string)$m[1]);
            }
            return $raw;
        }
        return '';
    }

    // ---------------- cache ----------------

    /**
     * Directory holding the cached embedding files.
     *
     * Falls back to the system temp directory when ROOT_PATH is not defined.
     */
    private function cacheDir(): string
    {
        $root = defined('ROOT_PATH') ? (string)constant('ROOT_PATH') : sys_get_temp_dir();
        return rtrim($root, '/\\') . '/runtime/embed_cache';
    }

    /**
     * Read a cached value.
     *
     * @param string $key        Cache key (used as the file name).
     * @param int    $ttlSeconds Maximum age of the cache file in seconds.
     *
     * @return mixed Decoded JSON value, or null when missing, expired or unreadable.
     */
    private function cacheGet(string $key, int $ttlSeconds)
    {
        $file = $this->cacheDir() . '/' . $key . '.json';
        if (!is_file($file)) {
            return null;
        }
        $mtime = filemtime($file);
        if (!$mtime || (time() - $mtime) > $ttlSeconds) {
            return null;
        }
        $raw = @file_get_contents($file);
        return json_decode((string)$raw, true);
    }

    /**
     * Write a value to the cache. Best effort: failures are silently ignored.
     *
     * @param string $key   Cache key (used as the file name).
     * @param mixed  $value JSON-encodable value.
     */
    private function cacheSet(string $key, $value): void
    {
        $json = json_encode($value, JSON_UNESCAPED_UNICODE);
        if ($json === false) {
            // Not encodable (e.g. NAN/INF); writing would leave an empty cache file.
            return;
        }

        $dir = $this->cacheDir();
        // The second is_dir() tolerates another process creating the directory first.
        if (!is_dir($dir) && !@mkdir($dir, 0777, true) && !is_dir($dir)) {
            return;
        }
        // LOCK_EX serialises concurrent writers of the same cache file.
        @file_put_contents($dir . '/' . $key . '.json', $json, LOCK_EX);
    }
}