修改自动推广的相关任务

2026-05-13 12:26:28 +08:00
parent c36eba77b1
commit fa878334cd
7 changed files with 289 additions and 29 deletions
--- a/.env
+++ b/.env
@@ -33,6 +33,9 @@ UNSUBSCRIBE_BASE_URL=https://submission.tmrjournals.com/api/Unsubscribe/index
 [yboard]
 APPLY_URL="https://submission.tmrjournals.com/youthBoardRegister"
 [plagiarism]
 static_root="/home/wwwroot/api.tmrjournals.com/public"
 [journal]
 ;官网服务器地址
 base_url = http://journalapi.tmrjournals.com/public/index.php
--- a/application/api/controller/Plagiarism.php
+++ b/application/api/controller/Plagiarism.php
@@ -51,7 +51,7 @@ class Plagiarism extends Base
            $localPath = $fileUrl !== ''
                ? $svc->resolveFileUrlToLocal($fileUrl)
                : $svc->locateArticleManuscript($articleId);
-
+            echo $localPath;
            $checkId = $svc->submit($articleId, $localPath, $editorId, 'manual');
            return jsonSuccess(['check_id' => $checkId]);
        } catch (\Throwable $e) {
@@ -59,6 +59,15 @@ class Plagiarism extends Base
        }
    }
    /**
     * Ad-hoc debug action: re-runs the upload-and-trigger step of the
     * plagiarism pipeline for a fixed check record and a fixed manuscript file.
     *
     * Both the check id (9) and the absolute file path are hard-coded, and the
     * call is made synchronously rather than through the queue.
     *
     * NOTE(review): this looks like a temporary test hook; as a public controller
     * method it is presumably reachable over HTTP — confirm, and consider
     * removing or guarding it before release.
     */
    public function testccone()
    {
        (new PlagiarismService())->runUploadAndTrigger(
            9,
            "/home/wwwroot/api.tmrjournals.com/public/manuscirpt/20260509/6832a56e8ace38fe99df390ab5221deb.docx"
        );
    }
    /**
     * 重试 = 提交一次新查重（保留历史）
     */
--- a/application/api/controller/Preaccept.php
+++ b/application/api/controller/Preaccept.php
@@ -6,6 +6,7 @@ use think\Db;
 use think\Env;
 use think\Queue;
 use think\Validate;
 use app\common\CrossrefService;
 class Preaccept extends Base
 {
@@ -708,36 +709,66 @@ class Preaccept extends Base
    }
    /**
     * 通过 DOI 获取文献元数据（Crossref REST API）。
     *
     * POST 参数:
     *   doi  必填，可为纯 DOI（10.xxxx/...）或 https://doi.org/10.xxxx/...
     *
     * 返回 data.formate 与旧版字段兼容: author, title, joura, dateno, doilink
     * 另附 data.crossref: 原始摘要字段（不含 raw message，避免体积过大）
     */
    public function searchDoi()
    {
        $data = $this->request->post();
        $rule = new Validate([
-            "doi" => "require"
+            'doi' => 'require',
        ]);
        if (!$rule->check($data)) {
            return jsonError($rule->getError());
        }
-        $doi = str_replace('/', '%2F', $data['doi']);
+
-//        $url = "https://citation.crosscite.org/format?doi=$doi&style=cancer-translational-medicine&lang=en-US";
+        $doiInput = trim((string)$data['doi']);
-        $url = "https://citation.doi.org/format?doi=$doi&style=cancer-translational-medicine&lang=en-US";
+        if ($doiInput === '') {
-        $res = myGet($url);
+            return jsonError('doi empty');
        $frag = trim(substr($res, strpos($res, '.') + 1));
        if ($frag == "") {
            return jsonError("not find");
        }
-        if (mb_substr_count($frag, '.') != 3) {
+        // 去掉 URL 前缀，得到裸 DOI
-            return jsonError("formate fail");
+        $doiNorm = preg_replace('#^https?://(dx\.)?doi\.org/#i', '', $doiInput);
        $doiNorm = trim($doiNorm, " \t\n\r\0\x0B/");
        $svc = new CrossrefService([
            'mailto' => trim((string)Env::get('crossref_mailto', '')),
        ]);
        $summary = $svc->fetchWorkSummary($doiNorm);
        if ($summary === null || empty($summary['doi'])) {
            return jsonError('DOI not found or invalid (Crossref)');
        }
-        $res = explode('.', $frag);
+
-        $f['author'] = prgeAuthor($res[0]);
+        $title = trim((string)($summary['title'] ?? ''));
-        $f['title'] = trim($res[1]);
+        $jouraRaw = trim((string)($summary['joura'] ?? ''));
-        $bj = bekjournal($res[2]);
+        $authorStr = trim((string)($summary['author_str'] ?? ''));
-        $joura = formateJournal(trim($bj[0]));
+        $dateno = trim((string)($summary['dateno'] ?? ''));
-        $f['joura'] = $joura;
+        $doilink = trim((string)($summary['doilink'] ?? ''));
-        $f['dateno'] = str_replace(' ', '', str_replace('-', '–', trim($bj[1])));
+        if ($doilink === '') {
-        $f['doilink'] = strpos($data['doi'], "http") === false ? "http://doi.org/" . $data['doi'] : $data['doi'];
+            $doilink = 'https://doi.org/' . $summary['doi'];
-        $re['formate'] = $f;
+        }
-        return jsonSuccess($re);
+
        $f = [
            'author'  => $authorStr !== '' ? prgeAuthor($authorStr) : '',
            'title'   => $title,
            'joura'   => $jouraRaw !== '' ? formateJournal($jouraRaw) : '',
            'dateno'  => str_replace(' ', '', str_replace('-', '–', $dateno)),
            'doilink' => $doilink,
        ];
        $crossrefOut = $summary;
        unset($crossrefOut['raw']);
        return jsonSuccess([
            'formate'   => $f,
            'crossref'  => $crossrefOut,
            'doi'       => $summary['doi'],
        ]);
    }
--- a/application/api/controller/Production.php
+++ b/application/api/controller/Production.php
@@ -10,6 +10,7 @@ use think\Db;
 use think\Queue;
 use think\Validate;
 use think\log;
 use app\common\ArticleSymbolNormalizer;
 /**
 * @title 公共管理相关
@@ -1380,6 +1381,10 @@ class Production extends Base
        return $html;
    }
    /**
     * Smoke-test hook for ArticleSymbolNormalizer.
     *
     * Calls normalize() with an empty string and discards the result, so it
     * produces no output and has no side effects of its own.
     *
     * NOTE(review): appears to be a leftover debugging action — confirm whether
     * it should ship.
     */
    public function testsym()
    {
        ArticleSymbolNormalizer::normalize('');
    }
    public function doTypeSettingNew()
    {
@@ -1399,7 +1404,7 @@ class Production extends Base
        $editor_info = $this->user_obj->where('user_id', $journal_info['editor_id'])->find();
        $typesetInfo = [];
-        $typesetInfo['info_title'] = $p_info['title'];
+        $typesetInfo['info_title'] = ArticleSymbolNormalizer::normalize($p_info['title']);
        $typesetInfo['info_type'] = $p_info['type'];
        $typesetInfo['doi'] = $p_info['doi'];
        $typesetInfo['topic'] = '';
--- a/application/api/job/PlagiarismRun.php
+++ b/application/api/job/PlagiarismRun.php
@@ -29,6 +29,7 @@ class PlagiarismRun
            return;
        }
        $svc = new PlagiarismService();
        $svc->log("PlagiarismRun job act!!");
        $svc->runUploadAndTrigger($checkId, $filePath);
        $job->delete();
    }
--- a/application/common/ArticleSymbolNormalizer.php
+++ b/application/common/ArticleSymbolNormalizer.php
@@ -0,0 +1,194 @@
 <?php
 namespace app\common;
 /**
 * Symbol-level proofreading for journal article content: adjusts punctuation,
 * whitespace and full-width/half-width forms only; it never adds or removes
 * words.
 *
 * Design notes:
 * - Default rules are conservative; each one can be switched off via $options.
 * - Use normalize() for plain text and normalizeHtml() for HTML (only the text
 *   between tags is processed, so attribute values such as URLs are untouched).
 * - Abstracts are often stored escaped (&gt; &lt; &amp; ...); normalizeAbstract()
 *   decodes entities first and then applies the symbol rules.
 * - English journals normally contain no Chinese text: pass english_journal=true
 *   (or call normalizeEnglishAbstract) to disable the Han-only rules.
 * - A regex step that fails (e.g. because the input is not valid UTF-8) is
 *   skipped and the text is left as it was, instead of being replaced by null.
 */
 class ArticleSymbolNormalizer
 {
    /** @var string Character-class fragment for Han characters: U+4E00–U+9FFF and U+3400–U+4DBF. */
    private static $han = '\x{4E00}-\x{9FFF}\x{3400}-\x{4DBF}';

    /**
     * Symbol proofreading for plain text.
     *
     * @param string $text    Input text (cast to string; '' is returned as '').
     * @param array  $options Optional boolean switches:
     *   - line_endings         (default true)  CRLF / CR → LF
     *   - fullwidth_space      (default true)  U+3000 ideographic space → ASCII space
     *   - collapse_spaces      (default true)  runs of 2+ spaces/tabs → one space (newlines untouched)
     *   - remove_zwsp          (default true)  delete U+200B–U+200D, U+FEFF and U+00AD
     *   - comma_cjk            (default true)  ASCII "," between two Han characters → "，"
     *   - comma_latin          (default true)  full-width "，" between two ASCII letters/digits → ","
     *   - period_cjk           (default true)  full-width "．" (U+FF0E) after a Han character → "。"
     *   - bracket_latin        (default false) "（…）" → "(…)" only when the content consists solely of
     *                                          ASCII letters/digits, whitespace and . , ; : - + / =
     *   - decode_html_entities (default false) decode HTML entities before all other rules
     *   - english_journal      (default false) turns comma_cjk and period_cjk off unless the caller
     *                                          passed those keys explicitly
     *
     * @return string The normalized text.
     */
    public static function normalize($text, array $options = [])
    {
        $text = (string)$text;
        if ($text === '') {
            return '';
        }

        $o = array_merge([
            'line_endings'         => true,
            'fullwidth_space'      => true,
            'collapse_spaces'      => true,
            'remove_zwsp'          => true,
            'comma_cjk'            => true,
            'comma_latin'          => true,
            'period_cjk'           => true,
            'bracket_latin'        => false,
            'decode_html_entities' => false,
            'english_journal'      => false,
        ], $options);

        // english_journal only overrides the Han-specific rules the caller did not set explicitly.
        if (!empty($o['english_journal'])) {
            foreach (['comma_cjk', 'period_cjk'] as $hanRule) {
                if (!array_key_exists($hanRule, $options)) {
                    $o[$hanRule] = false;
                }
            }
        }

        if (!empty($o['decode_html_entities'])) {
            $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
        }
        if (!empty($o['line_endings'])) {
            $text = str_replace(["\r\n", "\r"], "\n", $text);
        }
        if (!empty($o['fullwidth_space'])) {
            $text = str_replace("\u{3000}", ' ', $text);
        }
        if (!empty($o['remove_zwsp'])) {
            // Zero-width space / non-joiner / joiner, BOM, soft hyphen.
            $text = self::pregReplaceOrKeep('/[\x{200B}-\x{200D}\x{FEFF}\x{00AD}]/u', '', $text);
        }
        if (!empty($o['collapse_spaces'])) {
            $text = self::pregReplaceOrKeep('/[ \t]{2,}/u', ' ', $text);
        }

        $han = self::$han;
        if (!empty($o['comma_cjk'])) {
            // Han , Han → Han ， Han
            $text = self::pregReplaceOrKeep('/(?<=[' . $han . ']),(?=[' . $han . '])/u', '，', $text);
        }
        if (!empty($o['comma_latin'])) {
            // letter/digit ， letter/digit → ,
            $text = self::pregReplaceOrKeep('/(?<=[0-9A-Za-z])，(?=[0-9A-Za-z])/u', ',', $text);
        }
        if (!empty($o['period_cjk'])) {
            // Full-width full stop U+FF0E after a Han character → ideographic full stop.
            $text = self::pregReplaceOrKeep('/(?<=[' . $han . '])．/u', '。', $text);
        }
        if (!empty($o['bracket_latin'])) {
            // "#" is used as the delimiter because the character class contains a literal "/";
            // with "/" as delimiter the pattern is cut short and PCRE rejects it.
            $replaced = preg_replace_callback(
                '#（([0-9A-Za-z\s\.,;:\-\+/=]+)）#u',
                static function ($m) {
                    return '(' . $m[1] . ')';
                },
                $text
            );
            if ($replaced !== null) {
                $text = $replaced;
            }
        }

        return $text;
    }

    /**
     * preg_replace() wrapper that keeps the subject unchanged when PCRE reports
     * an error (preg_replace returns null, e.g. for malformed UTF-8 with /u).
     *
     * @param string $pattern
     * @param string $replacement
     * @param string $subject
     * @return string
     */
    private static function pregReplaceOrKeep($pattern, $replacement, $subject)
    {
        $result = preg_replace($pattern, $replacement, $subject);
        return $result === null ? $subject : $result;
    }

    /**
     * Symbol proofreading for an HTML fragment: only text outside tags is
     * rewritten; tag names and attribute values are passed through verbatim.
     *
     * The input is split on `<...>`; captured tag chunks are copied as-is and
     * every other chunk goes through normalize(). Each text chunk is normalized
     * independently, so rules that look at neighbouring characters do not see
     * across a tag. Malformed HTML, or an unescaped `<` / `>` inside text or
     * attribute values, can be mis-split; for such input extract plain text first.
     * If the split itself fails, the whole string is handed to normalize().
     *
     * @param string $html
     * @param array  $options Same as normalize().
     * @return string
     */
    public static function normalizeHtml($html, array $options = [])
    {
        $html = (string)$html;
        if ($html === '') {
            return '';
        }

        $parts = preg_split('/(<[^>]*>)/u', $html, -1, PREG_SPLIT_DELIM_CAPTURE);
        if ($parts === false) {
            return self::normalize($html, $options);
        }

        $out = '';
        foreach ($parts as $i => $chunk) {
            if ($chunk === '') {
                continue;
            }
            // With PREG_SPLIT_DELIM_CAPTURE the captured tags sit at odd indexes.
            $isTag = ($i % 2 === 1) && $chunk[0] === '<';
            $out .= $isTag ? $chunk : self::normalize($chunk, $options);
        }
        return $out;
    }

    /**
     * Abstract helper: decode HTML entities (&gt; → > etc.) and then apply the
     * same symbol rules as normalize().
     *
     * Intended for abstracts stored in escaped (htmlspecialchars-style) form.
     * If the abstract contains real HTML tags that must be preserved, use
     * normalizeHtml() / normalizeAbstractHtml() instead.
     *
     * @param string $abstract
     * @param array  $options Same as normalize(); decode_html_entities defaults
     *                        to true here and can be overridden with an explicit false.
     * @return string
     */
    public static function normalizeAbstract($abstract, array $options = [])
    {
        $opts = array_merge(['decode_html_entities' => true], $options);
        return self::normalize($abstract, $opts);
    }

    /**
     * Abstract containing HTML tags: entity decoding and symbol rules are
     * applied to the text outside tags only; tags and attributes are untouched.
     *
     * NOTE(review): decoded characters such as `<`, `>` and `&` are written back
     * unescaped into the text segments, so the result may no longer be
     * well-formed HTML — confirm that consumers expect this.
     *
     * @param string $html
     * @param array  $options Same as normalize(); decode_html_entities defaults to true.
     * @return string
     */
    public static function normalizeAbstractHtml($html, array $options = [])
    {
        $opts = array_merge(['decode_html_entities' => true], $options);
        return self::normalizeHtml($html, $opts);
    }

    /**
     * English-journal abstract: entity decoding plus symbol rules, with the
     * Han-only punctuation rules off by default.
     *
     * @param string $abstract
     * @param array  $options Same as normalize(); english_journal defaults to true.
     * @return string
     */
    public static function normalizeEnglishAbstract($abstract, array $options = [])
    {
        return self::normalizeAbstract($abstract, array_merge(['english_journal' => true], $options));
    }

    /**
     * English-journal abstract with HTML tags (text outside tags only): entity
     * decoding plus symbol rules, with the Han-only rules off by default.
     *
     * @param string $html
     * @param array  $options Same as normalize(); english_journal defaults to true.
     * @return string
     */
    public static function normalizeEnglishAbstractHtml($html, array $options = [])
    {
        return self::normalizeAbstractHtml($html, array_merge(['english_journal' => true], $options));
    }
 }
--- a/application/common/PlagiarismService.php
+++ b/application/common/PlagiarismService.php
@@ -34,6 +34,13 @@ class PlagiarismService
     */
    const MAX_POLL_ATTEMPTS = 60;
    private $logFile;
    /**
     * Sets the log file path to `runtime/plagiarism_task.log` underneath
     * ROOT_PATH (path segments joined with DS).
     */
    public function __construct()
    {
        $this->logFile = ROOT_PATH . implode(DS, ['runtime', 'plagiarism_task.log']);
    }
    // ---------- 顶层入口 ----------
    /**
@@ -55,6 +62,7 @@ class PlagiarismService
            ->where('article_id', $articleId)
            ->value('journal_id');
        $now = time();
        $checkId = Db::name('plagiarism_check')->insertGetId([
            'article_id'       => $articleId,
@@ -67,12 +75,12 @@ class PlagiarismService
            'ctime'            => $now,
            'utime'            => $now,
        ]);
-
+        $this->log("submit service act");
        // 入队执行：上传 + 触发 similarity
        Queue::push(
            'app\\api\\job\\PlagiarismRun',
            ['check_id' => $checkId, 'file_path' => $filePath],
-            'plagiarism'
+            'PlagiarismRun'
        );
        return (int)$checkId;
@@ -84,7 +92,7 @@ class PlagiarismService
    public function runUploadAndTrigger($checkId, $filePath)
    {
        $check = $this->mustGetCheck($checkId);
-
+        $this->log("runUploadAndTrigger is act0");
        try {
            $tii = new TurnitinService();
@@ -95,7 +103,7 @@ class PlagiarismService
            if ($articleTitle === '') {
                $articleTitle = 'Article #' . $check['article_id'];
            }
-
+            $this->log("runUploadAndTrigger is act1");
            $createResp = $tii->createSubmission([
                'title'     => mb_substr($articleTitle, 0, 250),
                'owner'     => 'editor_' . $check['triggered_by'],
@@ -114,7 +122,7 @@ class PlagiarismService
                'tii_submission_id' => $submissionId,
                'raw_response'      => json_encode($createResp, JSON_UNESCAPED_UNICODE),
            ]);
-
+            $this->log("runUploadAndTrigger is act2");
            // 2. 上传文件
            $tii->uploadFile($submissionId, $filePath, basename($filePath));
@@ -127,12 +135,14 @@ class PlagiarismService
                'raw_response'      => json_encode($simResp, JSON_UNESCAPED_UNICODE),
            ]);
            $this->log("runUploadAndTrigger is act3");
            // 4. 排队首次轮询（晚一点开始，让 Turnitin 先处理）
            Queue::later(
                self::POLL_INTERVAL,
                'app\\api\\job\\PlagiarismPoll',
                ['check_id' => $checkId, 'attempt' => 1],
-                'plagiarism'
+                'PlagiarismPoll'
            );
        } catch (\Throwable $e) {
            $this->markFailed($checkId, '[upload] ' . $e->getMessage());
@@ -320,6 +330,7 @@ class PlagiarismService
    private function markFailed($checkId, $errMsg)
    {
        $this->log("markFailed act");
        $this->updateCheck($checkId, [
            'state'     => 4,
            'error_msg' => mb_substr($errMsg, 0, 1000),
@@ -337,7 +348,7 @@ class PlagiarismService
        $row = Db::name('article_file')
            ->where('article_id', $articleId)
            ->where('type_name', 'manuscirpt')   // 历史拼写
-            ->order('article_file_id desc')
+            ->order('file_id desc')
            ->find();
        if (!$row || empty($row['file_url'])) {
            throw new Exception("article #{$articleId} has no manuscirpt file");
@@ -420,4 +431,10 @@ class PlagiarismService
    {
        return Db::name('plagiarism_check')->where('check_id', $checkId)->find();
    }
    /**
     * Append one timestamped line ("Y-m-d H:i:s <message>") to the task log file.
     *
     * Logging is best-effort: write failures are deliberately suppressed so a
     * logging problem can never abort the calling job or request.
     *
     * @param mixed $msg Message to log; arrays are JSON-encoded, anything else
     *                   is concatenated as a string.
     * @return void
     */
    public function log($msg)
    {
        if (is_array($msg)) {
            // Avoid "Array to string conversion" and keep the content readable.
            $msg = json_encode($msg, JSON_UNESCAPED_UNICODE);
        }
        $line = date('Y-m-d H:i:s') . ' ' . $msg . PHP_EOL;
        // LOCK_EX: callers in different processes append to the same file;
        // take an exclusive lock so lines are not interleaved.
        @file_put_contents($this->logFile, $line, FILE_APPEND | LOCK_EX);
    }
 }