自动推广

This commit is contained in:
wangjinlei
2026-04-03 11:45:45 +08:00
parent 22947a56a4
commit a802b2e923
11 changed files with 2240 additions and 36 deletions

View File

@@ -0,0 +1,331 @@
<?php
namespace app\common;
/**
 * Citation relevance checking service (PubMed + embedding).
 *
 * Collaborators:
 * - PubmedService: looked up by DOI; this class reads the keys title, abstract,
 *   mesh_terms, publication_types, pmid, year and journal from its result.
 * - CrossrefService: retraction/correction detection, and the evidence-based
 *   fallback when no PubMed text is available.
 *
 * Embedding:
 * - Uses an internal model endpoint (no per-token cost, but slow).
 * - embedding_url / embedding_headers / timeout are passed through the constructor config.
 * - Vectors are cached on disk to avoid paying for the same embedding twice.
 */
class CitationRelevanceService
{
    /** PubMed lookup service; must expose fetchByDoi($doi). */
    private $pubmed;
    /** Crossref QC service; must expose qcCitation($contextText, $referRow, $options). */
    private $crossref;
    /** Dedicated embeddings endpoint; an empty string means "not configured". */
    private $embeddingUrl = '';
    /** Extra raw HTTP header lines sent with every model request. */
    private $embeddingHeaders = [];
    /** cURL total timeout in seconds. */
    private $timeout = 120;
    /** chat/completions endpoint used only when no embeddings endpoint is configured. */
    private $chatUrl = '';
    /** Model name sent to the chat endpoint. */
    private $chatModel = '';
    /** Vector length requested from (and required of) the chat fallback. */
    private $embeddingDim = 256;
    /** max_tokens sent to the chat endpoint. */
    private $chatMaxTokens = 1200;
    /**
     * Whether to verify the TLS peer certificate of the model endpoints.
     * NOTE(review): defaults to false only to preserve the historical behaviour;
     * set config 'verify_ssl' => true wherever the endpoint has a valid certificate.
     */
    private $verifySsl = false;

    /**
     * @param PubmedService|null   $pubmed   Injected PubMed service; a default instance is created when null.
     * @param CrossrefService|null $crossref Injected Crossref service; a default instance is created when null.
     * @param array                $config   Optional settings:
     *   - embedding_url (string)
     *   - embedding_headers (string[]) raw header lines
     *   - timeout (int) seconds, clamped to >= 10
     *   - chat_url (string), chat_model (string)
     *   - embedding_dim (int) clamped to >= 32
     *   - chat_max_tokens (int) clamped to >= 256
     *   - verify_ssl (bool) default false
     */
    public function __construct(?PubmedService $pubmed = null, ?CrossrefService $crossref = null, array $config = [])
    {
        $this->pubmed = $pubmed ?? new PubmedService();
        $this->crossref = $crossref ?? new CrossrefService();

        if (isset($config['embedding_url'])) {
            $this->embeddingUrl = (string)$config['embedding_url'];
        }
        if (isset($config['embedding_headers']) && is_array($config['embedding_headers'])) {
            $this->embeddingHeaders = $config['embedding_headers'];
        }
        if (isset($config['timeout'])) {
            $this->timeout = max(10, intval($config['timeout']));
        }
        if (isset($config['chat_url'])) {
            $this->chatUrl = (string)$config['chat_url'];
        }
        if (isset($config['chat_model'])) {
            $this->chatModel = (string)$config['chat_model'];
        }
        if (isset($config['embedding_dim'])) {
            $this->embeddingDim = max(32, intval($config['embedding_dim']));
        }
        if (isset($config['chat_max_tokens'])) {
            $this->chatMaxTokens = max(256, intval($config['chat_max_tokens']));
        }
        if (isset($config['verify_ssl'])) {
            $this->verifySsl = (bool)$config['verify_ssl'];
        }
    }

    /**
     * Check the relevance of a single citation.
     *
     * @param string $contextText Text surrounding the citation (English).
     * @param array  $referRow    A production_article_refer row; refer_doi and doilink are read here,
     *                            the whole row is forwarded to CrossrefService::qcCitation().
     * @param array  $options
     *   - sentence_is_background (bool) caller already classified the sentence as a background
     *     citation pile; a low score is then reported as 'unsure_background'
     *   - sim_related (float) threshold for 'related', default 0.75
     *   - sim_unsure (float) threshold for 'unsure', default 0.60
     *   - check_retraction (bool) whether to look for retractions/corrections, default true
     *
     * @return array When PubMed text is available: problem_flag, problem_reason, relevance_flag
     *               ('related' | 'unsure' | 'unsure_background' | 'suspicious_unrelated'),
     *               relevance_score, reason ('embedding(context,pubmed_text)' or
     *               'embedding_unavailable') and a pubmed summary. Otherwise the array returned by
     *               CrossrefService::qcCitation() with problem_flag/problem_reason overridden and
     *               reason prefixed by 'fallback_crossref_evidence; '.
     */
    public function checkOne(string $contextText, array $referRow, array $options = []): array
    {
        $contextText = trim($contextText);
        $simRelated = isset($options['sim_related']) ? (float)$options['sim_related'] : 0.75;
        $simUnsure = isset($options['sim_unsure']) ? (float)$options['sim_unsure'] : 0.60;
        $checkRetraction = isset($options['check_retraction']) ? (bool)$options['check_retraction'] : true;
        $isBackground = !empty($options['sentence_is_background']);

        // 1) Problem entries (retracted / corrected): ask Crossref first.
        $problemFlag = 'unknown';
        $problemReason = '';
        if ($checkRetraction) {
            $qc = $this->crossref->qcCitation($contextText, $referRow, ['check_retraction' => true]);
            $problemFlag = $qc['problem_flag'] ?? 'unknown';
            $problemReason = $qc['problem_reason'] ?? '';
        }

        // 2) Fetch abstract / MeSH terms from PubMed to enrich the cited side of the comparison.
        $doi = $this->extractDoiFromRefer($referRow);
        $pub = $doi ? $this->pubmed->fetchByDoi($doi) : null;
        $pubText = '';
        $pubTypes = [];
        if (!empty($pub)) {
            $pubTypes = (array)($pub['publication_types'] ?? []);
            $mesh = (array)($pub['mesh_terms'] ?? []);
            $pubText = trim(
                ($pub['title'] ?? '') . "\n" .
                ($pub['abstract'] ?? '') . "\n" .
                (!empty($mesh) ? ('MeSH: ' . implode('; ', $mesh)) : '')
            );
        }

        // PubMed publication types can also reveal a retraction/correction. This is evaluated
        // before branching so that it applies to the Crossref fallback path as well.
        if ($checkRetraction && $problemFlag !== 'retracted_or_corrected' && !empty($pubTypes)) {
            $ptLower = strtolower(implode(' | ', $pubTypes));
            if (strpos($ptLower, 'retracted publication') !== false
                || strpos($ptLower, 'retraction of publication') !== false
                || strpos($ptLower, 'published erratum') !== false
            ) {
                $problemFlag = 'retracted_or_corrected';
                $problemReason = 'PubMed publication type indicates retraction/correction';
            }
        }

        // 3) Embedding similarity: context vs PubMed text.
        if ($pubText !== '') {
            $contextVec = $this->embedCached($contextText);
            // The embedding call is slow; skip the second one when the first already failed.
            $pubVec = $contextVec !== null ? $this->embedCached($pubText) : null;

            if ($contextVec === null || $pubVec === null) {
                // A missing embedding is an infrastructure problem, not evidence that the
                // citation is unrelated, so it must not be scored as 0 similarity.
                $sim = 0.0;
                $relevanceFlag = 'unsure';
                $reason = 'embedding_unavailable';
            } else {
                $sim = $this->cosine($contextVec, $pubVec);
                $reason = 'embedding(context,pubmed_text)';
                if ($sim >= $simRelated) {
                    $relevanceFlag = 'related';
                } elseif ($sim >= $simUnsure) {
                    $relevanceFlag = 'unsure';
                } else {
                    $relevanceFlag = $isBackground ? 'unsure_background' : 'suspicious_unrelated';
                }
            }

            return [
                'problem_flag' => $problemFlag,
                'problem_reason' => $problemReason,
                'relevance_flag' => $relevanceFlag,
                'relevance_score' => round($sim, 4),
                'reason' => $reason,
                'pubmed' => [
                    'pmid' => $pub['pmid'] ?? '',
                    'year' => $pub['year'] ?? '',
                    'journal' => $pub['journal'] ?? '',
                    'publication_types' => $pubTypes,
                ],
            ];
        }

        // Fallback: without PubMed text, use the CrossrefService evidence method (more conservative).
        $fallback = $this->crossref->qcCitation($contextText, $referRow, ['check_retraction' => false]);
        $fallback['problem_flag'] = $problemFlag;
        $fallback['problem_reason'] = $problemReason;
        $fallback['reason'] = 'fallback_crossref_evidence; ' . ($fallback['reason'] ?? '');
        return $fallback;
    }

    // ---------------- embedding ----------------

    /**
     * Return the embedding of $text, served from the file cache when a fresh entry exists.
     *
     * @return array|null Non-empty vector, or null for blank text / failed embedding.
     */
    private function embedCached(string $text): ?array
    {
        $text = trim($text);
        if ($text === '') {
            return null;
        }

        $key = 'emb_' . sha1($text);
        $cached = $this->cacheGet($key, 90 * 86400); // 90-day TTL
        if (is_array($cached) && !empty($cached)) {
            return $cached;
        }

        $vec = $this->embed($text);
        if (is_array($vec) && !empty($vec)) {
            $this->cacheSet($key, $vec);
            return $vec;
        }
        return null;
    }

    /**
     * Call the internal embedding endpoint.
     *
     * Accepted response shapes from the embeddings endpoint:
     * - OpenAI style: {data:[{embedding:[...]}]}
     * - {embedding:[...]}
     * - a bare JSON array [...]
     *
     * When no embeddings endpoint is configured, a chat/completions endpoint is asked to emit a
     * fixed-length vector instead.
     * NOTE(review): a vector written out by a chat model is presumably not a true embedding and
     * may not be stable between calls; configure embedding_url wherever possible.
     *
     * @return array|null Unit-length vector, or null on any transport/format failure.
     */
    private function embed(string $text): ?array
    {
        // 1) Prefer the dedicated embeddings endpoint.
        if ($this->embeddingUrl !== '') {
            $decoded = $this->postJson($this->embeddingUrl, ['text' => $text]);
            if ($decoded === null) {
                return null;
            }
            if (isset($decoded['data'][0]['embedding']) && is_array($decoded['data'][0]['embedding'])) {
                return $this->normalizeVector($decoded['data'][0]['embedding']);
            }
            if (isset($decoded['embedding']) && is_array($decoded['embedding'])) {
                return $this->normalizeVector($decoded['embedding']);
            }
            $isVec = isset($decoded[0]) && (is_float($decoded[0]) || is_int($decoded[0]));
            return $isVec ? $this->normalizeVector($decoded) : null;
        }

        // 2) No embeddings endpoint: ask chat/completions for a fixed-dimension vector.
        if ($this->chatUrl === '' || $this->chatModel === '') {
            return null;
        }
        $sys = "You are an embedding generator. Output ONLY valid JSON in this exact shape: {\"embedding\":[...]}.\n"
            . "Rules:\n"
            . "- embedding must be an array of exactly {$this->embeddingDim} floats\n"
            . "- each float must be between -1 and 1\n"
            . "- do not include any other keys or any extra text\n";
        $decoded = $this->postJson($this->chatUrl, [
            'model' => $this->chatModel,
            'temperature' => 0,
            'max_tokens' => $this->chatMaxTokens,
            'messages' => [
                ['role' => 'system', 'content' => $sys],
                ['role' => 'user', 'content' => $text],
            ],
        ]);
        if ($decoded === null || !isset($decoded['choices'][0]['message']['content'])) {
            return null;
        }
        $content = trim((string)$decoded['choices'][0]['message']['content']);
        if ($content === '') {
            return null;
        }
        // The content may be wrapped in a ```json ... ``` fence.
        if (preg_match('/```(?:json)?\\s*([\\s\\S]*?)\\s*```/i', $content, $m)) {
            $content = trim($m[1]);
        }
        $parsed = json_decode($content, true);
        if (!is_array($parsed) || !isset($parsed['embedding']) || !is_array($parsed['embedding'])) {
            return null;
        }
        $vec = $parsed['embedding'];
        // Reject truncated or over-long output; vectors of differing length are not comparable.
        if (count($vec) !== $this->embeddingDim) {
            return null;
        }
        return $this->normalizeVector($vec);
    }

    /**
     * POST a JSON body to a model endpoint and decode the JSON reply.
     *
     * @return array|null Decoded reply, or null when the request could not be built or sent,
     *                    the HTTP status is >= 400, or the body is empty / not a JSON array or object.
     */
    private function postJson(string $url, array $body): ?array
    {
        $payload = json_encode($body, JSON_UNESCAPED_UNICODE);
        if ($payload === false) {
            // Typically malformed UTF-8 in the text; nothing sensible can be sent.
            return null;
        }
        $ch = curl_init();
        if ($ch === false) {
            return null;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, $this->verifySsl);
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
        curl_setopt($ch, CURLOPT_HTTPHEADER, array_merge(['Content-Type: application/json'], $this->embeddingHeaders));
        $res = curl_exec($ch);
        $status = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        if (!is_string($res) || trim($res) === '' || $status >= 400) {
            return null;
        }
        $decoded = json_decode($res, true);
        return is_array($decoded) ? $decoded : null;
    }

    /**
     * Cosine similarity over the first min(count($a), count($b)) components.
     *
     * @return float 0.0 when either vector is empty or has zero magnitude.
     */
    private function cosine(array $a, array $b): float
    {
        $n = min(count($a), count($b));
        if ($n <= 0) {
            return 0.0;
        }
        $dot = 0.0;
        $normA = 0.0;
        $normB = 0.0;
        for ($i = 0; $i < $n; $i++) {
            $x = (float)$a[$i];
            $y = (float)$b[$i];
            $dot += $x * $y;
            $normA += $x * $x;
            $normB += $y * $y;
        }
        if ($normA <= 0.0 || $normB <= 0.0) {
            return 0.0;
        }
        return $dot / (sqrt($normA) * sqrt($normB));
    }

    /**
     * Cast every component to float, re-index as a list and scale to unit length.
     * An all-zero vector is returned unscaled.
     */
    private function normalizeVector(array $v): array
    {
        $out = [];
        $sumSquares = 0.0;
        foreach ($v as $x) {
            $fx = (float)$x;
            $out[] = $fx;
            $sumSquares += $fx * $fx;
        }
        if ($sumSquares <= 0.0) {
            return $out;
        }
        $norm = sqrt($sumSquares);
        foreach ($out as $i => $fx) {
            $out[$i] = $fx / $norm;
        }
        return $out;
    }

    /**
     * Extract a DOI from a reference row.
     *
     * refer_doi wins when non-empty. Otherwise the part of doilink after "doi.org/" (up to any
     * "?" or "#") is used, and a doilink without "doi.org/" is returned as-is.
     *
     * @return string The DOI, or '' when the row carries neither field.
     */
    private function extractDoiFromRefer(array $referRow): string
    {
        // CrossrefService's own extraction is not reachable from here, hence this small local version.
        $doi = trim((string)($referRow['refer_doi'] ?? ''));
        if ($doi !== '') {
            return $doi;
        }
        $doilink = trim((string)($referRow['doilink'] ?? ''));
        if ($doilink === '') {
            return '';
        }
        // "~" is the delimiter because the character class contains "#": with "#" as delimiter
        // the pattern ends early and preg_match() fails with "Unknown modifier".
        if (preg_match('~doi\\.org/([^?#]+)~i', $doilink, $m)) {
            return trim((string)$m[1]);
        }
        return $doilink;
    }

    // ---------------- cache ----------------

    /** Directory holding the cached vectors, built from the ROOT_PATH constant. */
    private function cacheDir(): string
    {
        return rtrim(ROOT_PATH, '/') . '/runtime/embed_cache';
    }

    /**
     * Read a cache entry.
     *
     * @return mixed|null Decoded JSON value, or null when the entry is missing, older than
     *                    $ttlSeconds, unreadable or not valid JSON.
     */
    private function cacheGet(string $key, int $ttlSeconds)
    {
        $file = $this->cacheDir() . '/' . $key . '.json';
        if (!is_file($file)) {
            return null;
        }
        $mtime = filemtime($file);
        if (!$mtime || (time() - $mtime) > $ttlSeconds) {
            return null;
        }
        // Best effort: the file may vanish between is_file() and the read, so the warning is
        // suppressed and a failed read is treated as a cache miss.
        $raw = @file_get_contents($file);
        if ($raw === false) {
            return null;
        }
        return json_decode($raw, true);
    }

    /**
     * Write a cache entry. Best effort: failures are silently ignored because the cache is
     * only an optimisation.
     */
    private function cacheSet(string $key, $value): void
    {
        $dir = $this->cacheDir();
        // The trailing is_dir() covers a concurrent process creating the directory first.
        if (!is_dir($dir) && !@mkdir($dir, 0777, true) && !is_dir($dir)) {
            return;
        }
        $encoded = json_encode($value, JSON_UNESCAPED_UNICODE);
        if ($encoded === false) {
            return;
        }
        // LOCK_EX keeps concurrent writers from interleaving their output in one file.
        @file_put_contents($dir . '/' . $key . '.json', $encoded, LOCK_EX);
    }
}