tougao/application/common/CrossrefService.php

<?php

namespace app\common;

/**
 * Crossref API helper service.
 *
 * Notes:
 * - Extracted into a service following the implementation style of
 *   application/api/controller/Crossrefdoi.php.
 * - Only performs "request + parse"; it never reads from or writes to a database.
 *
 * Supported constructor config keys: mailto, timeout, maxRetry, crossrefUrl, verifySsl.
 */
class CrossrefService
{
    // Configuration
    private $mailto = ''; // contact e-mail (raises request priority)
    private $timeout = 15; // request timeout in seconds
    private $maxRetry = 2; // maximum number of attempts per DOI (always >= 1)
    private $crossrefUrl = "https://api.crossref.org/works/"; // endpoint base URL
    private $verifySsl = true; // verify the server TLS certificate (opt out via config only)

    // Per-instance cache of successful lookups (lower-cased DOI => Crossref message),
    // so a reference cited many times in one article is fetched only once.
    private $workCache = [];
    private $workCacheLimit = 200; // oldest entry is evicted once this many are stored

    /**
     * @param array $config optional overrides: mailto(string), timeout(int), maxRetry(int),
     *                      crossrefUrl(string), verifySsl(bool)
     */
    public function __construct($config = [])
    {
        if (!is_array($config)) {
            return;
        }
        if (isset($config['mailto'])) $this->mailto = (string)$config['mailto'];
        if (isset($config['timeout'])) $this->timeout = intval($config['timeout']);
        // A value <= 0 used to mean "never send a request"; treat it as a single attempt instead.
        if (isset($config['maxRetry'])) $this->maxRetry = max(1, intval($config['maxRetry']));
        if (isset($config['crossrefUrl'])) $this->crossrefUrl = (string)$config['crossrefUrl'];
        if (isset($config['verifySsl'])) $this->verifySsl = (bool)$config['verifySsl'];
    }

    /**
     * Set the contact e-mail sent along with every request.
     *
     * @param string $mailto
     * @return $this
     */
    public function setMailto($mailto)
    {
        $this->mailto = (string)$mailto;
        return $this;
    }

    /**
     * Citation mark conversion: [n] in the body text maps to
     * production_article_refer.index = n-1 (index is zero-based).
     *
     * @param int $citationMark citation number in the body text, e.g. 13 (from [13])
     * @return int production_article_refer.index, e.g. 12 (never negative)
     */
    public function referIndexFromCitationMark(int $citationMark): int
    {
        return max(0, $citationMark - 1);
    }

    /**
     * Reverse conversion: production_article_refer.index (zero-based) to the
     * citation number [n] used in the body text.
     *
     * @param int $referIndex production_article_refer.index, e.g. 12
     * @return int citation number n, e.g. 13 (never negative)
     */
    public function citationMarkFromReferIndex(int $referIndex): int
    {
        return max(0, $referIndex + 1);
    }

    /**
     * Batch citation quality check (no database access):
     * - takes the article section contents (list of t_article_main content values)
     * - takes the reference entries (list of production_article_refer rows)
     * - extracts the English sentence context around every [n], maps it to
     *   refer.index = n-1 and calls qcCitation()
     *
     * This method performs no database query, so callers are free to combine data sources.
     *
     * @param array $articleMainContents article content fragments (in sort order); each element is
     *                                   a string or an array holding a 'content' key
     * @param array $referRows production_article_refer rows (at least
     *                         index/title/author/joura/dateno/refer_doi/doilink)
     * @param array $options passed through to qcCitation, plus:
     *  - sentence_window(int) number of neighbouring sentences on each side, default 1
     *    (previous sentence + citing sentence + next sentence)
     * @return array list of results, each with citation_mark/refer_index/context/ref_meta/qc
     */
    public function qcArticleCitations(array $articleMainContents, array $referRows, array $options = []): array
    {
        $window = isset($options['sentence_window']) ? max(0, intval($options['sentence_window'])) : 1;

        // 1) Assemble the plain text of the whole article (the [n] marks are kept).
        $chunks = [];
        foreach ($articleMainContents as $row) {
            $text = is_array($row) ? (string)($row['content'] ?? '') : (string)$row;
            if ($text === '') continue;
            // Drop the custom <blue> tag and regular markup, keep [n].
            $text = (string)preg_replace('/<\s*\/?\s*blue[^>]*>/i', '', $text);
            $text = strip_tags($text);
            $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
            $text = $this->collapseWhitespace($text);
            if ($text !== '') $chunks[] = $text;
        }
        $fullText = implode("\n", $chunks);
        if ($fullText === '') return [];

        // 2) Build the reference lookup: refer_index => row.
        $referMap = [];
        foreach ($referRows as $r) {
            if (!is_array($r) || !isset($r['index'])) continue;
            $referMap[intval($r['index'])] = $r;
        }

        // 3) Split into English sentences (simple, robust variant).
        $sentences = $this->splitEnglishSentences($fullText);
        if (empty($sentences)) return [];
        $lastSentence = count($sentences) - 1;

        // 4) Walk the sentences and collect every [n] they contain.
        $results = [];
        foreach ($sentences as $si => $sent) {
            if (!preg_match_all('/\[(\d+)\]/', $sent, $m)) {
                continue;
            }

            $ctx = null; // built lazily; identical for every mark of this sentence
            foreach (array_unique(array_map('intval', $m[1])) as $citationMark) {
                if ($citationMark <= 0) continue;
                $referIndex = $this->referIndexFromCitationMark($citationMark);
                if (!isset($referMap[$referIndex])) {
                    continue;
                }

                if ($ctx === null) {
                    $start = max(0, $si - $window);
                    $end = min($lastSentence, $si + $window);
                    $ctx = $this->collapseWhitespace(
                        implode(' ', array_slice($sentences, $start, $end - $start + 1))
                    );
                }

                $refMeta = $referMap[$referIndex];
                $qc = $this->qcCitation($ctx, $refMeta, $options);

                $results[] = [
                    'citation_mark' => $citationMark,     // body-text number n (from [n])
                    'refer_index'   => $referIndex,       // production_article_refer.index
                    'context'       => $ctx,
                    'ref_meta'      => [
                        'p_refer_id' => $refMeta['p_refer_id'] ?? 0,
                        'title'      => $refMeta['title'] ?? '',
                        'author'     => $refMeta['author'] ?? '',
                        'joura'      => $refMeta['joura'] ?? '',
                        'dateno'     => $refMeta['dateno'] ?? '',
                        'refer_doi'  => $refMeta['refer_doi'] ?? '',
                        'doilink'    => $refMeta['doilink'] ?? '',
                        'index'      => $refMeta['index'] ?? $referIndex,
                    ],
                    'qc'            => $qc,
                ];
            }
        }

        return $results;
    }

    /**
     * Filter out invalid DOIs (only the 10.xxxx/xxx shape is accepted).
     *
     * @param string $doi
     * @return string the trimmed DOI, or '' when it does not look like a DOI
     */
    public function filterValidDoi($doi = '')
    {
        $doi = trim((string)$doi);
        if ($doi === '') return '';
        if (preg_match('/^10\.\d{4,}\/.+/', $doi)) {
            return $doi;
        }
        return '';
    }

    /**
     * Fetch the Crossref "message" object for a DOI (with retry).
     *
     * Successful lookups are cached on this instance; failures are not cached so a
     * later call can try again.
     *
     * @param string $doi
     * @return array|null null when the DOI is invalid or the lookup failed
     */
    public function fetchWork($doi)
    {
        $doi = $this->filterValidDoi($doi);
        if ($doi === '') return null;

        // DOI names are case-insensitive, so normalise the cache key.
        $cacheKey = strtolower($doi);
        if (isset($this->workCache[$cacheKey])) {
            return $this->workCache[$cacheKey];
        }

        $msg = $this->fetchSingleDoiWithRetry($doi);
        if (is_array($msg)) {
            if (count($this->workCache) >= $this->workCacheLimit) {
                array_shift($this->workCache); // drop the oldest entry to bound memory use
            }
            $this->workCache[$cacheKey] = $msg;
        }
        return $msg;
    }

    /**
     * Return the commonly used fields of a work
     * (title / journal / authors / volume-issue-pages / retraction / URL).
     *
     * @param string $doi
     * @return array|null null when the work could not be fetched
     */
    public function fetchWorkSummary($doi)
    {
        $msg = $this->fetchWork($doi);
        if (!$msg) return null;

        $validDoi = $this->filterValidDoi($doi);
        $publisher = $this->getPublisher($msg);
        $authors = $this->getAuthors($msg);
        $retractInfo = $this->checkRetracted($msg);

        // Prefer the full container title, fall back to the short one.
        $joura = !empty($publisher['title']) ? $publisher['title'] : ($publisher['short_title'] ?? '');

        $dolink = $this->getDolink($msg);
        if (empty($dolink)) {
            $dolink = 'https://doi.org/' . $validDoi;
        }

        return [
            'doi' => $validDoi,
            'title' => $this->getTitle($msg),
            'joura' => $joura,
            'publisher' => $publisher,
            'authors' => $authors,
            'author_str' => empty($authors) ? '' : implode(',', $authors),
            'dateno' => $this->getVolumeIssuePages($msg),
            'is_retracted' => !empty($retractInfo['is_retracted']) ? 1 : 0,
            'retract_reason' => $retractInfo['reason'] ?? '',
            'doilink' => $dolink,
            'raw' => $msg,
        ];
    }

    /**
     * Query a single DOI (with retry).
     *
     * - HTTP 200: returns the decoded "message" when status is "ok", otherwise null.
     * - HTTP 429: waits 5 seconds before the next attempt.
     * - Other 4xx (except 408): gives up immediately, repeating the request cannot succeed.
     * - Anything else (network error, 5xx, 408): waits 1 second before the next attempt.
     * No wait is performed after the final attempt.
     *
     * @param string $doi an already validated DOI
     * @return array|null
     */
    private function fetchSingleDoiWithRetry($doi)
    {
        // Strip CR/LF so the address can never inject extra HTTP headers.
        $mailto = str_replace(["\r", "\n"], '', (string)$this->mailto);

        $url = $this->crossrefUrl . rawurlencode($doi);
        $userAgent = 'DOI-Fetcher/1.0';
        if ($mailto !== '') {
            $url .= "?mailto=" . rawurlencode($mailto);
            $userAgent .= " (mailto:{$mailto})";
        }

        $attempts = max(1, (int)$this->maxRetry);
        for ($attempt = 1; $attempt <= $attempts; $attempt++) {
            $ch = curl_init();
            if ($ch === false) {
                return null;
            }
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
            // Certificate verification stays on unless explicitly disabled through config.
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, $this->verifySsl);
            curl_setopt($ch, CURLOPT_HTTPHEADER, [
                "User-Agent: {$userAgent}"
            ]);
            $response = curl_exec($ch);
            $httpCode = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
            curl_close($ch);

            if ($httpCode === 200) {
                $data = is_string($response) ? json_decode($response, true) : null;
                if (is_array($data) && isset($data['status']) && $data['status'] == 'ok') {
                    return $data['message'] ?? null;
                }
                return null;
            }

            // Client errors such as 404 (unknown DOI) are permanent for this request.
            if ($httpCode >= 400 && $httpCode < 500 && $httpCode !== 429 && $httpCode !== 408) {
                return null;
            }

            if ($attempt < $attempts) {
                sleep($httpCode === 429 ? 5 : 1);
            }
        }

        return null;
    }

    /**
     * Extract the title (first entry of the Crossref 'title' list).
     *
     * @param array $aDoiInfo Crossref message
     * @return string
     */
    public function getTitle($aDoiInfo = [])
    {
        return $aDoiInfo['title'][0] ?? '';
    }

    /**
     * Extract journal / publisher related information.
     *
     * @param array $aDoiInfo Crossref message
     * @return array keys: title, short_title, ISSN, publisher
     */
    public function getPublisher($aDoiInfo = [])
    {
        return [
            'title'       => $aDoiInfo['container-title'][0] ?? '',
            'short_title' => $aDoiInfo['short-container-title'][0] ?? '',
            'ISSN'        => $aDoiInfo['ISSN'] ?? [],
            'publisher'   => $aDoiInfo['publisher'] ?? '',
        ];
    }

    /**
     * Extract the author list as "given family" strings.
     *
     * Contributors that only carry a 'name' (organisations, consortia) are included
     * under that name.
     *
     * @param array $aDoiInfo Crossref message
     * @return array list of author names
     */
    public function getAuthors($aDoiInfo = [])
    {
        $authors = [];
        if (empty($aDoiInfo['author']) || !is_array($aDoiInfo['author'])) {
            return $authors;
        }

        foreach ($aDoiInfo['author'] as $author) {
            if (!is_array($author)) continue;

            $name = (string)($author['family'] ?? '');
            if (!empty($author['given'])) {
                $name = trim($author['given'] . ' ' . $name);
            }
            if ($name === '' && !empty($author['name'])) {
                $name = (string)$author['name'];
            }
            if (!empty($name)) {
                $authors[] = $name;
            }
        }
        return $authors;
    }

    /**
     * Extract the publication year from issued.date-parts.
     *
     * @param array $aDoiInfo Crossref message
     * @return string the year, or '' when absent
     */
    public function getPublishYear($aDoiInfo = [])
    {
        if (!empty($aDoiInfo['issued']['date-parts'][0][0])) {
            return (string)$aDoiInfo['issued']['date-parts'][0][0];
        }
        return '';
    }

    /**
     * Build "year:volume(issue):firstPage-lastPage" (e.g. 2024:10(2):100-120).
     * Missing parts are simply left out.
     *
     * @param array $aDoiInfo Crossref message
     * @return string
     */
    public function getVolumeIssuePages($aDoiInfo = [])
    {
        $parts = [];

        $year = $this->getPublishYear($aDoiInfo);
        if ($year) $parts[] = $year;

        $volume = $aDoiInfo['volume'] ?? '';
        $issue  = $aDoiInfo['issue'] ?? '';
        if ($volume) {
            $parts[] = $volume . ($issue ? "({$issue})" : '');
        }

        // 'page' may be a structured array (start/end) or a plain string such as "100-120".
        $page = $aDoiInfo['page'] ?? '';
        $pageArr = is_array($page) ? $page : [];
        $pageStart = $pageArr['start'] ?? ($aDoiInfo['first-page'] ?? '');
        $pageEnd   = $pageArr['end'] ?? ($aDoiInfo['last-page'] ?? '');

        if ($pageStart) {
            $pages = $pageStart . ($pageEnd ? "-{$pageEnd}" : '');
        } else {
            // Only a scalar can be used verbatim; an array here would render as "Array".
            $pages = is_scalar($page) ? (string)$page : '';
        }
        if ($pages) $parts[] = $pages;

        return implode(':', $parts);
    }

    /**
     * Detect retracted works (same rules as Crossrefdoi.php).
     *
     * The rules are evaluated in order and a later hit overwrites the reason of an
     * earlier one: type/subtype, update-type, relation, update-to, title keywords.
     * Note that correction notices are flagged as well.
     *
     * @param array $aDoiInfo Crossref message
     * @return array ['is_retracted' => bool, 'reason' => string]
     */
    public function checkRetracted($aDoiInfo = [])
    {
        $isRetracted = false;
        $reason = "未撤稿";

        $noticeTypes = ['retraction', 'correction'];

        $sType = strtolower($aDoiInfo['type'] ?? '');
        $sSubtype = strtolower($aDoiInfo['subtype'] ?? '');
        if ($sType && in_array($sType, $noticeTypes, true)) {
            $isRetracted = true;
            $reason = "文章类型为{$sType}（撤稿/更正声明）";
        }
        if ($sSubtype && in_array($sSubtype, $noticeTypes, true)) {
            $isRetracted = true;
            $reason = "文章类型为{$sSubtype}（撤稿/更正声明）";
        }

        // Strict comparison: a loose in_array() matches integer 0 against any string on PHP 7.
        if (isset($aDoiInfo['update-type']) && is_array($aDoiInfo['update-type'])
            && in_array('retraction', $aDoiInfo['update-type'], true)) {
            $isRetracted = true;
            $reason = "官方标记为撤稿（update-type: retraction）";
        }

        if (!empty($aDoiInfo['relation']) && is_array($aDoiInfo['relation'])) {
            foreach ($aDoiInfo['relation'] as $relType => $relItems) {
                // Cast the key: integer keys must never match a relation name.
                if (in_array((string)$relType, ['is-retraction-of', 'corrects'], true)) {
                    $isRetracted = true;
                    $relatedDoi = $relItems[0]['id'] ?? '未知';
                    $reason = "关联撤稿文章{$relatedDoi}（关系：{$relType}）";
                    break;
                }
            }
        }

        if (isset($aDoiInfo['update-to']) && is_array($aDoiInfo['update-to'])) {
            foreach ($aDoiInfo['update-to'] as $update) {
                if (!is_array($update)) continue;
                $updateType  = strtolower($update['type'] ?? '');
                $updateLabel = strtolower($update['label'] ?? '');
                if (strpos($updateType, 'retract') !== false || strpos($updateLabel, 'retract') !== false) {
                    $isRetracted = true;
                    $reason = "update-to 标记撤稿（{$updateType}/{$updateLabel}）";
                    break;
                }
            }
        }

        $keywords = ['retraction', 'retracted', 'withdrawal', 'withdrawn'];
        foreach ((array)($aDoiInfo['title'] ?? []) as $value) {
            if (!is_string($value)) continue;
            $sTitleLower = strtolower($value);
            foreach ($keywords as $keyword) {
                if (strpos($sTitleLower, $keyword) !== false) {
                    $isRetracted = true;
                    $reason = "标题包含撤稿关键词";
                    break 2;
                }
            }
        }

        return [
            'is_retracted' => $isRetracted,
            'reason' => $reason,
        ];
    }

    /**
     * Extract the DOI link (the 'URL' field of the Crossref message).
     *
     * @param array $aDoiInfo Crossref message
     * @return string
     */
    public function getDolink($aDoiInfo = [])
    {
        return $aDoiInfo['URL'] ?? '';
    }

    /**
     * Format a Crossref date object (its first 'date-parts' entry) as
     * "YYYY", "YYYY-MM" or "YYYY-MM-DD"; month and day are zero-padded.
     *
     * @param array $dateObj e.g. ['date-parts' => [[2024, 3, 5]]]
     * @return string '' when no date parts are present
     */
    public function parseDateParts($dateObj)
    {
        $parts = $dateObj['date-parts'][0] ?? [];
        if (empty($parts)) return '';

        $y = $parts[0] ?? '';
        $m = $parts[1] ?? '';
        $d = $parts[2] ?? '';

        $out = (string)$y;
        if ($m !== '') $out .= '-' . str_pad((string)$m, 2, '0', STR_PAD_LEFT);
        if ($d !== '') $out .= '-' . str_pad((string)$d, 2, '0', STR_PAD_LEFT);
        return $out;
    }

    /**
     * Citation quality check. Decides (1) whether the cited entry looks retracted or
     * corrected (based on Crossref) and (2) whether the citing context is related to
     * the cited entry (based on evidence hits).
     *
     * Notes:
     * - Meant for cases without abstract/keywords (only title/author/journal/year plus
     *   the citing sentences are used).
     * - When neither refer_doi nor doilink yields a DOI, problem_flag can only be 'unknown'.
     *
     * @param string $contextText context around the citation (English; ideally the citing
     *                            sentence plus a few neighbours)
     * @param array $refMeta metadata of the cited entry (ideally a production_article_refer row):
     *  - refer_doi / doilink / title / author / joura / dateno
     * @param array $options optional settings
     *  - check_retraction(bool) whether to query Crossref for retraction/correction; default true
     *  - background_phrases(array) phrases that mark a generic "background" citation;
     *    defaults to the built-in list
     *
     * @return array
     *  [
     *    'problem_flag' => 'ok'|'retracted_or_corrected'|'unknown',
     *    'problem_reason' => string,
     *    'relevance_flag' => 'related'|'unsure'|'unsure_background'|'suspicious_unrelated',
     *    'relevance_score' => float,
     *    'reason' => string
     *  ]
     */
    public function qcCitation(string $contextText, array $refMeta, array $options = []): array
    {
        $contextText = trim($contextText);
        $checkRetraction = isset($options['check_retraction']) ? (bool)$options['check_retraction'] : true;

        $refTitle = (string)($refMeta['title'] ?? '');
        $refAuthor = (string)($refMeta['author'] ?? '');
        $refJoura = (string)($refMeta['joura'] ?? '');
        $refDateno = (string)($refMeta['dateno'] ?? '');
        $referDoi = (string)($refMeta['refer_doi'] ?? '');
        $doilink = (string)($refMeta['doilink'] ?? '');

        $doi = $this->extractDoiFromMeta($referDoi, $doilink);

        // 1) Retraction / correction check (hard rule, drives problem_flag).
        $problemFlag = 'unknown';
        if (!$checkRetraction) {
            $problemReason = 'Skip retraction check';
        } elseif (empty($doi)) {
            $problemReason = 'DOI is empty';
        } else {
            $summary = $this->fetchWorkSummary($doi);
            if (!$summary || !isset($summary['is_retracted'])) {
                $problemReason = 'Crossref fetch failed or returned unexpected data';
            } elseif ((int)$summary['is_retracted'] === 1) {
                $problemFlag = 'retracted_or_corrected';
                $problemReason = !empty($summary['retract_reason'])
                    ? $summary['retract_reason']
                    : 'Crossref indicates retraction/correction';
            } else {
                $problemFlag = 'ok';
                $problemReason = 'Crossref indicates not retracted/corrected';
            }
        }

        // 2) Relevance check (soft rules + evidence hits).
        $backgroundPhrases = isset($options['background_phrases']) ? (array)$options['background_phrases'] : [
            'several studies',
            'many studies',
            'the literature',
            'the existing literature',
            'has been reported',
            'have been reported',
            'it has been shown',
            'previous studies',
            'the study suggests',
            'the literature suggests',
            'in the literature',
        ];

        $ctxLower = strtolower($contextText);
        $isBackground = false;
        foreach ($backgroundPhrases as $ph) {
            $ph = strtolower(trim((string)$ph));
            if ($ph !== '' && $ph !== '0' && strpos($ctxLower, $ph) !== false) {
                $isBackground = true;
                break;
            }
        }

        $refTokens = $this->buildEvidenceTokens([
            'title' => $refTitle,
            'author' => $refAuthor,
            'journal' => $refJoura,
            'year' => $refDateno,
        ]);

        $ctxTokens = $this->tokenize($contextText);

        $titleOverlap = 0.0;
        $authorHit = 0.0;
        $journalOverlap = 0.0;
        $yearHit = 0.0;

        $titleTokens = $refTokens['titleTokens'] ?? [];
        $authorTokens = $refTokens['authorTokens'] ?? [];
        $journalTokens = $refTokens['journalTokens'] ?? [];
        $yearToken = $refTokens['yearToken'] ?? '';

        if (!empty($titleTokens)) {
            $inter = array_intersect($titleTokens, $ctxTokens);
            $titleOverlap = count($inter) / max(1, count($titleTokens));
        }

        foreach ($authorTokens as $at) {
            if ($at !== '' && $this->authorTokenInContext((string)$at, $ctxTokens)) {
                $authorHit = 1.0;
                break;
            }
        }

        if (!empty($journalTokens)) {
            $interJ = array_intersect($journalTokens, $ctxTokens);
            $journalOverlap = count($interJ) / max(1, count($journalTokens));
        }

        if (!empty($yearToken) && strpos($ctxLower, (string)$yearToken) !== false) {
            $yearHit = 1.0;
        }

        // Combined score (kept explainable: higher means more related).
        $score = round((
            0.60 * $titleOverlap +
            0.20 * $authorHit +
            0.15 * $yearHit +
            0.05 * $journalOverlap
        ), 4);

        $relevanceFlag = 'unsure';
        $reasonParts = [];

        if ($score >= 0.35 && ($authorHit > 0.0 || $yearHit > 0.0)) {
            $relevanceFlag = 'related';
            $reasonParts[] = 'title_keyword_overlap_high=' . $titleOverlap;
        } elseif ($score >= 0.25) {
            $relevanceFlag = 'unsure';
            $reasonParts[] = 'evidence_score_mid=' . $score;
        } elseif ($isBackground) {
            $relevanceFlag = 'unsure_background';
            $reasonParts[] = 'background_phrases_detected';
        } else {
            $relevanceFlag = 'suspicious_unrelated';
            $reasonParts[] = 'evidence_score_low=' . $score;
        }

        $reasonParts[] = 'titleOverlap=' . $titleOverlap;
        $reasonParts[] = 'authorHit=' . $authorHit;
        $reasonParts[] = 'yearHit=' . $yearHit;
        $reasonParts[] = 'journalOverlap=' . $journalOverlap;

        return [
            'problem_flag' => $problemFlag,
            'problem_reason' => $problemReason,
            'relevance_flag' => $relevanceFlag,
            'relevance_score' => (float)$score,
            'reason' => implode('; ', $reasonParts),
        ];
    }

    /**
     * Tell whether an author token occurs in the tokenized context.
     *
     * Context tokens are split on punctuation, so a hyphenated or dotted author token
     * (e.g. "smith-jones") can never match as a whole; in that case every one of its
     * parts has to be present instead.
     *
     * @param string $token lower-cased author token
     * @param array $ctxTokens tokens produced by tokenize()
     * @return bool
     */
    private function authorTokenInContext(string $token, array $ctxTokens): bool
    {
        if (in_array($token, $ctxTokens, true)) {
            return true;
        }
        if (strpbrk($token, '-.') === false) {
            return false;
        }
        $parts = $this->tokenize($token);
        return !empty($parts) && count(array_diff($parts, $ctxTokens)) === 0;
    }

    /**
     * Extract a DOI string from refer_doi / doilink.
     *
     * refer_doi is tried first; when it is empty or does not contain a valid DOI the
     * doilink value is used as a fallback.
     *
     * @param string $referDoi
     * @param string $doilink
     * @return string a valid DOI, or '' when none could be extracted
     */
    private function extractDoiFromMeta(string $referDoi, string $doilink): string
    {
        foreach ([$referDoi, $doilink] as $raw) {
            $doi = $this->normalizeDoiCandidate($raw);
            if ($doi !== '') {
                return $doi;
            }
        }
        return '';
    }

    /**
     * Reduce one raw value to a bare DOI.
     *
     * Accepts a bare DOI, a "doi:" prefixed DOI, or a resolver URL such as
     * https://doi.org/10.xxxx/xxxx (query string and fragment are ignored).
     *
     * @param string $raw
     * @return string a valid DOI, or ''
     */
    private function normalizeDoiCandidate(string $raw): string
    {
        $raw = trim($raw);
        if ($raw === '') return '';

        // "~" is used as delimiter because "#" has to appear inside the character class.
        if (preg_match('~doi\.org/([^?#\s]+)~i', $raw, $m)) {
            // The path may be percent-encoded (e.g. %2F for the slash).
            return $this->filterValidDoi(rawurldecode((string)$m[1]));
        }

        $candidate = preg_replace('~^doi:\s*~i', '', $raw);
        return $this->filterValidDoi($candidate === null ? $raw : $candidate);
    }

    /**
     * Build the evidence tokens used for the rough evidence-hit / similarity estimate.
     *
     * @param array $src keys: title, author, journal, year
     * @return array keys: titleTokens, authorTokens, journalTokens, yearToken
     */
    private function buildEvidenceTokens(array $src): array
    {
        $stop = [
            'the','a','an','and','or','of','in','on','for','with','to','from','by','at','as','is','are',
            'was','were','be','been','being','that','this','these','those','which','who','whom','it','its',
            'we','our','us','they','their','them','i','you','your','he','she','his','her',
            'study','studies','report','reported','reports','model','models','analysis','analyses','method','methods',
            'results','result','using','used','show','shown','demonstrated','demonstrate',
        ];

        // Title and journal share the same rule: no stop words, at least 4 characters.
        $informative = function ($t) use ($stop) {
            return !in_array($t, $stop, true) && mb_strlen($t) >= 4;
        };

        $titleTokens = array_values(array_filter(
            array_unique($this->tokenize((string)($src['title'] ?? ''))),
            $informative
        ));

        $journalTokens = array_values(array_filter(
            array_unique($this->tokenize((string)($src['journal'] ?? ''))),
            $informative
        ));

        $authorTokens = [];
        foreach ($this->extractAuthorTokens((string)($src['author'] ?? '')) as $t) {
            $t = trim($t);
            if ($t === '' || in_array($t, $stop, true)) continue;
            $authorTokens[] = $t;
        }
        $authorTokens = array_values(array_unique($authorTokens));

        // First four-digit year (19xx / 20xx) found in the raw value.
        $yearToken = '';
        $yearRaw = (string)($src['year'] ?? '');
        if (preg_match('/(19\d{2}|20\d{2})/', $yearRaw, $m)) {
            $yearToken = (string)$m[1];
        }

        return [
            'titleTokens' => $titleTokens,
            'authorTokens' => $authorTokens,
            'journalTokens' => $journalTokens,
            'yearToken' => $yearToken,
        ];
    }

    /**
     * Extract one surname-like token per author (simplified).
     *
     * For every author the words are scanned from last to first and the first one
     * with at least 4 letters is kept, so both "John Smith" and "Smith J" yield
     * "smith" (trailing initials are skipped). Authors without such a word yield nothing.
     *
     * @param string $authorStr authors separated by ',', ';' or ' and '
     * @return array lower-cased tokens, unique
     */
    private function extractAuthorTokens(string $authorStr): array
    {
        $authorStr = trim($authorStr);
        if ($authorStr === '') return [];

        // Split on the common separators.
        $people = preg_split('/[,;]| and /i', $authorStr);
        if (!is_array($people)) return [];

        $tokens = [];
        foreach ($people as $person) {
            $words = preg_split('/\s+/', trim((string)$person), -1, PREG_SPLIT_NO_EMPTY);
            if (empty($words)) continue;

            for ($i = count($words) - 1; $i >= 0; $i--) {
                // Keep letters, dots and hyphens only (drops stray symbols).
                $cand = strtolower((string)preg_replace('/[^A-Za-z\.\-]/', '', $words[$i]));
                // Count letters only, so initials such as "J.A." are not mistaken for a surname.
                $letters = (string)preg_replace('/[^a-z]/', '', $cand);
                if (strlen($letters) >= 4) {
                    $tokens[] = $cand;
                    break;
                }
            }
        }

        return array_values(array_unique($tokens));
    }

    /**
     * Lightweight tokenizer for English text: lower-cases, splits on every
     * non-alphanumeric character and drops tokens shorter than 3 characters.
     *
     * @param string $text
     * @return array unique tokens
     */
    private function tokenize(string $text): array
    {
        $text = strtolower(trim($text));
        if ($text === '') return [];

        $pieces = preg_split('/[^a-z0-9]+/i', $text, -1, PREG_SPLIT_NO_EMPTY);
        if (!is_array($pieces)) return [];

        $tokens = [];
        foreach ($pieces as $piece) {
            // Keep only tokens that carry some information.
            if (mb_strlen($piece) < 3) continue;
            $tokens[] = $piece;
        }

        return array_values(array_unique($tokens));
    }

    /**
     * Lightweight English sentence splitter: breaks after . ? ! when followed by
     * whitespace; [n] marks inside a sentence are kept.
     *
     * @param string $text
     * @return array list of non-empty sentences
     */
    private function splitEnglishSentences(string $text): array
    {
        // Normalise line breaks to single spaces first so they cannot interrupt a sentence.
        $text = $this->collapseWhitespace($text);
        if ($text === '') return [];

        $pieces = preg_split('/(?<=[\.\?\!])\s+/', $text);
        if (!is_array($pieces)) return [$text];

        $sentences = [];
        foreach ($pieces as $piece) {
            $piece = trim((string)$piece);
            if ($piece !== '') {
                $sentences[] = $piece;
            }
        }
        return $sentences;
    }

    /**
     * Collapse every whitespace run into a single space and trim the result.
     *
     * The Unicode-aware pattern fails (returns null) on malformed UTF-8; a byte-wise
     * pattern is used in that case so the text is not silently lost.
     *
     * @param string $text
     * @return string
     */
    private function collapseWhitespace(string $text): string
    {
        $text = trim($text);
        if ($text === '') return '';

        $collapsed = preg_replace('/\s+/u', ' ', $text);
        if ($collapsed === null) {
            $collapsed = preg_replace('/\s+/', ' ', $text);
        }
        return trim((string)$collapsed);
    }
}