pubmed = $pubmed ?: new PubmedService();
        $this->crossref = $crossref ?: new CrossrefService();

        // Optional overrides; numeric settings are clamped to a lower bound via max().
        if (isset($config['embedding_url'])) $this->embeddingUrl = (string)$config['embedding_url'];
        if (isset($config['embedding_headers']) && is_array($config['embedding_headers'])) $this->embeddingHeaders = $config['embedding_headers'];
        if (isset($config['timeout'])) $this->timeout = max(10, intval($config['timeout']));
        if (isset($config['chat_url'])) $this->chatUrl = (string)$config['chat_url'];
        if (isset($config['chat_model'])) $this->chatModel = (string)$config['chat_model'];
        if (isset($config['embedding_dim'])) $this->embeddingDim = max(32, intval($config['embedding_dim']));
        if (isset($config['chat_max_tokens'])) $this->chatMaxTokens = max(256, intval($config['chat_max_tokens']));
    }

    /**
     * Relevance check for a single citation.
     *
     * Flow:
     *  1. Optionally ask CrossrefService::qcCitation() for a retraction/correction flag.
     *  2. Fetch PubMed metadata (title, abstract, MeSH terms, publication types) by DOI.
     *  3. If PubMed text is available, embed both the context and the PubMed text and
     *     classify by cosine similarity; otherwise fall back to the result of
     *     CrossrefService::qcCitation().
     *
     * @param string $contextText Text surrounding the citation (English).
     * @param array  $referRow    A production_article_refer row (at least
     *                            refer_doi/doilink/title/author/joura/dateno).
     * @param array  $options
     *   - sentence_is_background (bool)  Set when the caller already knows the sentence is a
     *                                    background "citation pile"; a low score is then
     *                                    reported as 'unsure_background' rather than
     *                                    'suspicious_unrelated'.
     *   - sim_related (float)            Threshold for 'related', default 0.75.
     *   - sim_unsure (float)             Threshold for 'unsure', default 0.60.
     *   - check_retraction (bool)        Whether to check retraction/correction, default true.
     *
     * @return array In the PubMed path: problem_flag, problem_reason, relevance_flag,
     *               relevance_score, reason, pubmed{pmid, year, journal, publication_types}.
     *               In the fallback path: whatever qcCitation() returns, with problem_flag,
     *               problem_reason and reason overwritten.
     */
    public function checkOne(string $contextText, array $referRow, array $options = []): array
    {
        $contextText = trim($contextText);

        $simRelated = isset($options['sim_related']) ? (float)$options['sim_related'] : 0.75;
        $simUnsure = isset($options['sim_unsure']) ? (float)$options['sim_unsure'] : 0.60;
        $checkRetraction = isset($options['check_retraction']) ? (bool)$options['check_retraction'] : true;
        $isBackground = !empty($options['sentence_is_background']);

        // 1) Problematic entries (retracted/corrected): ask Crossref first (needs a DOI to decide).
        $problemFlag = 'unknown';
        $problemReason = '';
        if ($checkRetraction) {
            $qc = $this->crossref->qcCitation($contextText, $referRow, ['check_retraction' => true]);
            $problemFlag = $qc['problem_flag'] ?? 'unknown';
            $problemReason = $qc['problem_reason'] ?? '';
        }

        // 2) Fetch abstract/MeSH from PubMed (gives richer text for the semantic comparison).
        $doi = $this->extractDoiFromRefer($referRow);
        $pub = $doi ? $this->pubmed->fetchByDoi($doi) : null;

        $pubText = '';
        $pubTypes = [];
        if ($pub) {
            $pubTypes = $pub['publication_types'] ?? [];
            $mesh = $pub['mesh_terms'] ?? [];
            $meshLine = (is_array($mesh) && !empty($mesh)) ? ('MeSH: ' . implode('; ', $mesh)) : '';
            $pubText = trim(
                ($pub['title'] ?? '') . "\n" .
                ($pub['abstract'] ?? '') . "\n" .
                $meshLine
            );
        }

        // 3) Embedding similarity (context vs pubmed_text). Without pubmed_text we fall
        //    through to the Crossref evidence method below.
        if ($pubText !== '') {
            $contextVec = $this->embedCached($contextText);
            $pubVec = $this->embedCached($pubText);
            $embeddingOk = ($contextVec !== null && $pubVec !== null);

            if (!$embeddingOk) {
                // The embedding backend failed (or the context was empty). A score of 0 here
                // says nothing about relevance, so do not flag the citation as unrelated.
                $sim = 0.0;
                $relevanceFlag = 'unsure';
            } else {
                $sim = $this->cosine($contextVec, $pubVec);
                if ($sim >= $simRelated) {
                    $relevanceFlag = 'related';
                } elseif ($sim >= $simUnsure) {
                    $relevanceFlag = 'unsure';
                } else {
                    $relevanceFlag = $isBackground ? 'unsure_background' : 'suspicious_unrelated';
                }
            }

            // PubMed publication types can also signal a retraction/correction (supplementary).
            if ($checkRetraction && $problemFlag !== 'retracted_or_corrected' && !empty($pubTypes)) {
                $ptLower = strtolower(implode(' | ', (array)$pubTypes));
                if (
                    strpos($ptLower, 'retracted publication') !== false ||
                    strpos($ptLower, 'retraction of publication') !== false ||
                    strpos($ptLower, 'published erratum') !== false
                ) {
                    $problemFlag = 'retracted_or_corrected';
                    $problemReason = 'PubMed publication type indicates retraction/correction';
                }
            }

            return [
                'problem_flag' => $problemFlag,
                'problem_reason' => $problemReason,
                'relevance_flag' => $relevanceFlag,
                'relevance_score' => round($sim, 4),
                'reason' => $embeddingOk ? 'embedding(context,pubmed_text)' : 'embedding_unavailable',
                'pubmed' => [
                    'pmid' => $pub['pmid'] ?? '',
                    'year' => $pub['year'] ?? '',
                    'journal' => $pub['journal'] ?? '',
                    'publication_types' => $pubTypes,
                ],
            ];
        }

        // Fallback: no PubMed information, so use the CrossrefService evidence method
        // (tends to be conservative). The problem flag computed in step 1 is preserved.
        $fallback = $this->crossref->qcCitation($contextText, $referRow, ['check_retraction' => false]);
        $fallback['problem_flag'] = $problemFlag;
        $fallback['problem_reason'] = $problemReason;
        $fallback['reason'] = 'fallback_crossref_evidence; ' . ($fallback['reason'] ?? '');
        return $fallback;
    }

    // ---------------- embedding ----------------

    /**
     * Return the embedding for $text, using the file cache (90-day TTL) when possible.
     *
     * NOTE(review): the cache key depends only on the text, not on the embedding backend
     * or dimension; switching backends may serve stale vectors until entries expire.
     *
     * @return array|null Non-empty vector, or null for empty text / backend failure.
     */
    private function embedCached(string $text): ?array
    {
        $text = trim($text);
        if ($text === '') {
            return null;
        }

        $key = 'emb_' . sha1($text);
        $cached = $this->cacheGet($key, 90 * 86400);
        if (is_array($cached) && !empty($cached)) {
            return $cached;
        }

        $vec = $this->embed($text);
        if (is_array($vec) && !empty($vec)) {
            $this->cacheSet($key, $vec);
            return $vec;
        }
        return null;
    }

    /**
     * Call the internal embedding endpoint.
     *
     * Accepted response formats of the embeddings endpoint:
     *  - OpenAI embeddings: {data:[{embedding:[...] }]}
     *  - {embedding:[...]}
     *  - a bare array [...]
     *
     * When no embeddings endpoint is configured, a chat/completions endpoint is asked to
     * emit a fixed-dimension vector as JSON instead.
     *
     * @return array|null L2-normalized vector, or null on any failure.
     */
    private function embed(string $text): ?array
    {
        // 1) Prefer the dedicated embeddings endpoint.
        if ($this->embeddingUrl !== '') {
            $payload = json_encode(['text' => $text], JSON_UNESCAPED_UNICODE);
            if ($payload === false) {
                // e.g. invalid UTF-8 in $text; there is nothing valid to send.
                return null;
            }

            $res = $this->httpPostJson($this->embeddingUrl, $payload);
            if ($res === null) {
                return null;
            }

            $decoded = json_decode($res, true);
            if (is_array($decoded)) {
                if (isset($decoded['data'][0]['embedding']) && is_array($decoded['data'][0]['embedding'])) {
                    return $this->normalizeVector($decoded['data'][0]['embedding']);
                }
                if (isset($decoded['embedding']) && is_array($decoded['embedding'])) {
                    return $this->normalizeVector($decoded['embedding']);
                }
                // Bare numeric list.
                $isVec = isset($decoded[0]) && (is_float($decoded[0]) || is_int($decoded[0]));
                if ($isVec) {
                    return $this->normalizeVector($decoded);
                }
            }
            return null;
        }

        // 2) No embeddings endpoint: use chat/completions to produce a fixed-dimension vector.
        if ($this->chatUrl === '' || $this->chatModel === '') {
            return null;
        }

        $sys = "You are an embedding generator. Output ONLY valid JSON in this exact shape: {\"embedding\":[...]}.\n"
            . "Rules:\n"
            . "- embedding must be an array of exactly {$this->embeddingDim} floats\n"
            . "- each float must be between -1 and 1\n"
            . "- do not include any other keys or any extra text\n";

        $payload = json_encode([
            'model' => $this->chatModel,
            'temperature' => 0,
            'max_tokens' => $this->chatMaxTokens,
            'messages' => [
                ['role' => 'system', 'content' => $sys],
                ['role' => 'user', 'content' => $text],
            ],
        ], JSON_UNESCAPED_UNICODE);
        if ($payload === false) {
            return null;
        }

        $res = $this->httpPostJson($this->chatUrl, $payload);
        if ($res === null) {
            return null;
        }

        $decoded = json_decode($res, true);
        $content = '';
        if (is_array($decoded) && isset($decoded['choices'][0]['message']['content'])) {
            $content = (string)$decoded['choices'][0]['message']['content'];
        }
        $content = trim($content);
        if ($content === '') {
            return null;
        }

        // The content may be wrapped in a ```json ... ``` fence.
        if (preg_match('/```(?:json)?\\s*([\\s\\S]*?)\\s*```/i', $content, $m)) {
            $content = trim($m[1]);
        }

        $j = json_decode($content, true);
        if (!is_array($j) || !isset($j['embedding']) || !is_array($j['embedding'])) {
            return null;
        }

        $vec = $j['embedding'];
        // Reject vectors of the wrong size so cached vectors stay comparable.
        if (count($vec) !== $this->embeddingDim) {
            return null;
        }
        return $this->normalizeVector($vec);
    }

    /**
     * POST a JSON payload and return the raw response body.
     *
     * Sends 'Content-Type: application/json' plus the configured embedding headers.
     *
     * @return string|null Response body, or null when the request failed or the body is blank.
     */
    private function httpPostJson(string $url, string $payload): ?string
    {
        $ch = curl_init();
        if ($ch === false) {
            return null;
        }

        curl_setopt_array($ch, [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT => $this->timeout,
            // NOTE(review): TLS peer verification is disabled. Kept as-is because the
            // endpoint may use a self-signed certificate; enable it if the endpoint
            // presents a verifiable certificate.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => $payload,
            CURLOPT_HTTPHEADER => array_merge(['Content-Type: application/json'], $this->embeddingHeaders),
        ]);

        $res = curl_exec($ch);
        curl_close($ch);

        if (!is_string($res) || trim($res) === '') {
            return null;
        }
        return $res;
    }

    /**
     * Cosine similarity of two numeric vectors.
     *
     * Only the first min(count($a), count($b)) components are compared. Returns 0.0 when
     * either vector is empty or has zero magnitude over that range.
     */
    private function cosine(array $a, array $b): float
    {
        $n = min(count($a), count($b));
        if ($n <= 0) {
            return 0.0;
        }

        $dot = 0.0;
        $na = 0.0;
        $nb = 0.0;
        for ($i = 0; $i < $n; $i++) {
            $x = (float)$a[$i];
            $y = (float)$b[$i];
            $dot += $x * $y;
            $na += $x * $x;
            $nb += $y * $y;
        }

        if ($na <= 0.0 || $nb <= 0.0) {
            return 0.0;
        }
        return $dot / (sqrt($na) * sqrt($nb));
    }

    /**
     * Cast every component to float and scale the vector to unit L2 length.
     *
     * An all-zero vector is returned unscaled (there is no direction to normalize).
     * The result is always a 0-indexed list.
     */
    private function normalizeVector(array $v): array
    {
        $sum = 0.0;
        $out = [];
        foreach ($v as $x) {
            $fx = (float)$x;
            $out[] = $fx;
            $sum += $fx * $fx;
        }
        if ($sum <= 0.0) {
            return $out;
        }

        $norm = sqrt($sum);
        foreach ($out as $i => $fx) {
            $out[$i] = $fx / $norm;
        }
        return $out;
    }

    /**
     * Extract a DOI from a reference row.
     *
     * Uses refer_doi verbatim when present; otherwise takes the part of doilink after
     * "doi.org/" (up to any "?" or "#"), or doilink itself when it is not a doi.org URL.
     *
     * @return string The DOI, or '' when the row carries none.
     */
    private function extractDoiFromRefer(array $referRow): string
    {
        // CrossrefService's own extraction logic is not directly accessible from here,
        // so a simple version is implemented locally.
        $doi = trim((string)($referRow['refer_doi'] ?? ''));
        if ($doi !== '') {
            return $doi;
        }

        $doilink = trim((string)($referRow['doilink'] ?? ''));
        if ($doilink === '') {
            return '';
        }

        // The delimiter must not occur unescaped in the pattern. With '#' as delimiter the
        // '#' inside [^?#] ended the pattern early and preg_match() failed, so the full
        // URL was returned instead of the DOI.
        if (preg_match('~doi\\.org/([^?#]+)~i', $doilink, $m)) {
            return trim((string)$m[1]);
        }
        return $doilink;
    }

    // ---------------- cache ----------------

    /**
     * Directory holding the cached embedding files.
     */
    private function cacheDir(): string
    {
        return rtrim(ROOT_PATH, '/') . '/runtime/embed_cache';
    }

    /**
     * Read a cached value.
     *
     * @param string $key        Cache key (used as the file name).
     * @param int    $ttlSeconds Maximum age of the file, judged by its mtime.
     * @return mixed Decoded JSON value, or null when missing, expired or unreadable.
     */
    private function cacheGet(string $key, int $ttlSeconds)
    {
        $file = $this->cacheDir() . '/' . $key . '.json';
        if (!is_file($file)) {
            return null;
        }

        $mtime = filemtime($file);
        if (!$mtime || (time() - $mtime) > $ttlSeconds) {
            return null;
        }

        // Best-effort read: a failed read becomes '' which decodes to null (a cache miss).
        $raw = @file_get_contents($file);
        return json_decode((string)$raw, true);
    }

    /**
     * Store a value in the cache. Best-effort: failures are ignored so that caching
     * problems never break the relevance check itself.
     *
     * @param string $key   Cache key (used as the file name).
     * @param mixed  $value JSON-encodable value.
     */
    private function cacheSet(string $key, $value): void
    {
        $encoded = json_encode($value, JSON_UNESCAPED_UNICODE);
        if ($encoded === false) {
            // Do not write an unusable cache entry.
            return;
        }

        $dir = $this->cacheDir();
        if (!is_dir($dir)) {
            @mkdir($dir, 0777, true);
        }

        // LOCK_EX keeps concurrent writers of the same key from interleaving output.
        @file_put_contents($dir . '/' . $key . '.json', $encoded, LOCK_EX);
    }
}