From fa878334cd151a29627aac8f2e01d8ce27770606 Mon Sep 17 00:00:00 2001 From: wangjinlei <751475802@qq.com> Date: Wed, 13 May 2026 12:26:28 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=87=AA=E5=8A=A8=E6=8E=A8?= =?UTF-8?q?=E5=B9=BF=E7=9A=84=E7=9B=B8=E5=85=B3=E4=BB=BB=E5=8A=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .env | 3 + application/api/controller/Plagiarism.php | 11 +- application/api/controller/Preaccept.php | 71 +++++-- application/api/controller/Production.php | 7 +- application/api/job/PlagiarismRun.php | 1 + .../common/ArticleSymbolNormalizer.php | 194 ++++++++++++++++++ application/common/PlagiarismService.php | 31 ++- 7 files changed, 289 insertions(+), 29 deletions(-) create mode 100644 application/common/ArticleSymbolNormalizer.php diff --git a/.env b/.env index ad8120e4..2f67571d 100644 --- a/.env +++ b/.env @@ -33,6 +33,9 @@ UNSUBSCRIBE_BASE_URL=https://submission.tmrjournals.com/api/Unsubscribe/index [yboard] APPLY_URL="https://submission.tmrjournals.com/youthBoardRegister" +[plagiarism] +static_root="/home/wwwroot/api.tmrjournals.com/public" + [journal] ;官网服务器地址 base_url = http://journalapi.tmrjournals.com/public/index.php diff --git a/application/api/controller/Plagiarism.php b/application/api/controller/Plagiarism.php index 76cc0c1a..febfbde1 100644 --- a/application/api/controller/Plagiarism.php +++ b/application/api/controller/Plagiarism.php @@ -51,7 +51,7 @@ class Plagiarism extends Base $localPath = $fileUrl !== '' ? $svc->resolveFileUrlToLocal($fileUrl) : $svc->locateArticleManuscript($articleId); - + echo $localPath; $checkId = $svc->submit($articleId, $localPath, $editorId, 'manual'); return jsonSuccess(['check_id' => $checkId]); } catch (\Throwable $e) { @@ -59,6 +59,15 @@ class Plagiarism extends Base } } + + public function testccone(){ + $svc = new PlagiarismService(); + $checkId = 9; + $filePath = "/home/wwwroot/api.tmrjournals.com/public/manuscirpt/20260509/6832a56e8ace38fe99df390ab5221deb.docx"; + $svc->runUploadAndTrigger($checkId,$filePath); + + } + /** * 重试 = 提交一次新查重(保留历史) */ diff --git a/application/api/controller/Preaccept.php b/application/api/controller/Preaccept.php index 78fe1240..9b4867c7 100644 --- a/application/api/controller/Preaccept.php +++ b/application/api/controller/Preaccept.php @@ -6,6 +6,7 @@ use think\Db; use think\Env; use think\Queue; use think\Validate; +use app\common\CrossrefService; class Preaccept extends Base { @@ -708,36 +709,66 @@ class Preaccept extends Base } + /** + * 通过 DOI 获取文献元数据(Crossref REST API)。 + * + * POST 参数: + * doi 必填,可为纯 DOI(10.xxxx/...)或 https://doi.org/10.xxxx/... + * + * 返回 data.formate 与旧版字段兼容: author, title, joura, dateno, doilink + * 另附 data.crossref: 原始摘要字段(不含 raw message,避免体积过大) + */ public function searchDoi() { $data = $this->request->post(); $rule = new Validate([ - "doi" => "require" + 'doi' => 'require', ]); if (!$rule->check($data)) { return jsonError($rule->getError()); } - $doi = str_replace('/', '%2F', $data['doi']); -// $url = "https://citation.crosscite.org/format?doi=$doi&style=cancer-translational-medicine&lang=en-US"; - $url = "https://citation.doi.org/format?doi=$doi&style=cancer-translational-medicine&lang=en-US"; - $res = myGet($url); - $frag = trim(substr($res, strpos($res, '.') + 1)); - if ($frag == "") { - return jsonError("not find"); + + $doiInput = trim((string)$data['doi']); + if ($doiInput === '') { + return jsonError('doi empty'); } - if (mb_substr_count($frag, '.') != 3) { - return jsonError("formate fail"); + // 去掉 URL 前缀,得到裸 DOI + $doiNorm = preg_replace('#^https?://(dx\.)?doi\.org/#i', '', $doiInput); + $doiNorm = trim($doiNorm, " \t\n\r\0\x0B/"); + + $svc = new CrossrefService([ + 'mailto' => trim((string)Env::get('crossref_mailto', '')), + ]); + $summary = $svc->fetchWorkSummary($doiNorm); + if ($summary === null || empty($summary['doi'])) { + return jsonError('DOI not found or invalid (Crossref)'); } - $res = explode('.', $frag); - $f['author'] = prgeAuthor($res[0]); - $f['title'] = trim($res[1]); - $bj = bekjournal($res[2]); - $joura = formateJournal(trim($bj[0])); - $f['joura'] = $joura; - $f['dateno'] = str_replace(' ', '', str_replace('-', '–', trim($bj[1]))); - $f['doilink'] = strpos($data['doi'], "http") === false ? "http://doi.org/" . $data['doi'] : $data['doi']; - $re['formate'] = $f; - return jsonSuccess($re); + + $title = trim((string)($summary['title'] ?? '')); + $jouraRaw = trim((string)($summary['joura'] ?? '')); + $authorStr = trim((string)($summary['author_str'] ?? '')); + $dateno = trim((string)($summary['dateno'] ?? '')); + $doilink = trim((string)($summary['doilink'] ?? '')); + if ($doilink === '') { + $doilink = 'https://doi.org/' . $summary['doi']; + } + + $f = [ + 'author' => $authorStr !== '' ? prgeAuthor($authorStr) : '', + 'title' => $title, + 'joura' => $jouraRaw !== '' ? formateJournal($jouraRaw) : '', + 'dateno' => str_replace(' ', '', str_replace('-', '–', $dateno)), + 'doilink' => $doilink, + ]; + + $crossrefOut = $summary; + unset($crossrefOut['raw']); + + return jsonSuccess([ + 'formate' => $f, + 'crossref' => $crossrefOut, + 'doi' => $summary['doi'], + ]); } diff --git a/application/api/controller/Production.php b/application/api/controller/Production.php index fe29df2c..0b67c0d1 100644 --- a/application/api/controller/Production.php +++ b/application/api/controller/Production.php @@ -10,6 +10,7 @@ use think\Db; use think\Queue; use think\Validate; use think\log; +use app\common\ArticleSymbolNormalizer; /** * @title 公共管理相关 @@ -1380,6 +1381,10 @@ class Production extends Base return $html; } + public function testsym(){ + ArticleSymbolNormalizer::normalize(""); + } + public function doTypeSettingNew() { @@ -1399,7 +1404,7 @@ class Production extends Base $editor_info = $this->user_obj->where('user_id', $journal_info['editor_id'])->find(); $typesetInfo = []; - $typesetInfo['info_title'] = $p_info['title']; + $typesetInfo['info_title'] = ArticleSymbolNormalizer::normalize($p_info['title']); $typesetInfo['info_type'] = $p_info['type']; $typesetInfo['doi'] = $p_info['doi']; $typesetInfo['topic'] = ''; diff --git a/application/api/job/PlagiarismRun.php b/application/api/job/PlagiarismRun.php index 767edd5e..74d18d79 100644 --- a/application/api/job/PlagiarismRun.php +++ b/application/api/job/PlagiarismRun.php @@ -29,6 +29,7 @@ class PlagiarismRun return; } $svc = new PlagiarismService(); + $svc->log("PlagiarismRun job act!!"); $svc->runUploadAndTrigger($checkId, $filePath); $job->delete(); } diff --git a/application/common/ArticleSymbolNormalizer.php b/application/common/ArticleSymbolNormalizer.php new file mode 100644 index 00000000..c3515290 --- /dev/null +++ b/application/common/ArticleSymbolNormalizer.php @@ -0,0 +1,194 @@ + true, + 'fullwidth_space' => true, + 'collapse_spaces' => true, + 'remove_zwsp' => true, + 'comma_cjk' => true, + 'comma_latin' => true, + 'period_cjk' => true, + 'bracket_latin' => false, + 'decode_html_entities' => false, + 'english_journal' => false, + ], $options); + + if (!empty($o['english_journal'])) { + if (!array_key_exists('comma_cjk', $options)) { + $o['comma_cjk'] = false; + } + if (!array_key_exists('period_cjk', $options)) { + $o['period_cjk'] = false; + } + } + + if (!empty($o['decode_html_entities'])) { + $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8'); + } + + if (!empty($o['line_endings'])) { + $text = str_replace(["\r\n", "\r"], "\n", $text); + } + if (!empty($o['fullwidth_space'])) { + $text = str_replace("\u{3000}", ' ', $text); + } + if (!empty($o['remove_zwsp'])) { + // 零宽空格、零宽非断空格、BOM、软连字符等(不改变可见字符) + $text = preg_replace('/[\x{200B}-\x{200D}\x{FEFF}\x{00AD}]/u', '', $text); + } + if (!empty($o['collapse_spaces'])) { + $text = preg_replace('/[ \t]{2,}/u', ' ', $text); + } + + $han = self::$han; + + if (!empty($o['comma_cjk'])) { + // 汉字 , 汉字 → 汉字 , 汉字 + $text = preg_replace('/(?<=[' . $han . ']),(?=[' . $han . '])/u', ',', $text); + } + if (!empty($o['comma_latin'])) { + // 字母/数字 , 字母/数字 → , + $text = preg_replace('/(?<=[0-9A-Za-z]),(?=[0-9A-Za-z])/u', ',', $text); + } + if (!empty($o['period_cjk'])) { + // 汉字后的全角英文句点 FF0E → 中文句号 。 + $text = preg_replace('/(?<=[' . $han . '])./u', '。', $text); + } + if (!empty($o['bracket_latin'])) { + // ( 仅 ASCII + 常见标点 + 空格 ) + $text = preg_replace_callback( + '/(([0-9A-Za-z\s\.,;:\-\+/=]+))/u', + static function ($m) { + return '(' . $m[1] . ')'; + }, + $text + ); + } + + return $text; + } + + /** + * 对 HTML 片段做符号校对:只替换「标签外」的文本,不修改标签名与属性值。 + * + * 实现:按 `<...>` 切分,对偶数段(文本)调用 normalize(),奇数段(标签)原样保留。 + * 注意:畸形 HTML、属性值中含未转义 `<` 时可能误判,复杂场景请先抽纯文本再校对。 + * + * @param string $html + * @param array $options 同 normalize() + * @return string + */ + public static function normalizeHtml($html, array $options = []) + { + $html = (string)$html; + if ($html === '') { + return ''; + } + + $parts = preg_split('/(<[^>]*>)/u', $html, -1, PREG_SPLIT_DELIM_CAPTURE); + if ($parts === false) { + return self::normalize($html, $options); + } + + $out = ''; + foreach ($parts as $i => $chunk) { + if ($chunk === '') { + continue; + } + // 偶数索引为文本,奇数索引且以 < 开头为标签 + if ($i % 2 === 1 && isset($chunk[0]) && $chunk[0] === '<') { + $out .= $chunk; + } else { + $out .= self::normalize($chunk, $options); + } + } + + return $out; + } + + /** + * Abstract 专用:先 HTML 实体解码(> → > 等),再执行与普通正文相同的符号校对。 + * + * 适用于摘要字段在库中/接口中以 htmlspecialchars 形式存储的场景。 + * 若摘要内本身含真实 HTML 标签且需保留标签结构,请改用 normalizeHtml() 并自行传入 decode_html_entities。 + * + * @param string $abstract + * @param array $options 同 normalize(),默认会合并 decode_html_entities=true(可被显式 false 覆盖) + * @return string + */ + public static function normalizeAbstract($abstract, array $options = []) + { + $opts = array_merge(['decode_html_entities' => true], $options); + return self::normalize($abstract, $opts); + } + + /** + * 带 HTML 标签的摘要:仅在「标签外文本」中做实体解码 + 符号校对,不改动标签与属性。 + * + * @param string $html + * @param array $options 同 normalize(),默认 decode_html_entities=true + * @return string + */ + public static function normalizeAbstractHtml($html, array $options = []) + { + $opts = array_merge(['decode_html_entities' => true], $options); + return self::normalizeHtml($html, $opts); + } + + /** + * 英文期刊 Abstract:实体解码 + 符号校对,且默认关闭中文专用标点规则。 + */ + public static function normalizeEnglishAbstract($abstract, array $options = []) + { + return self::normalizeAbstract($abstract, array_merge(['english_journal' => true], $options)); + } + + /** + * 英文期刊、带 HTML 的摘要(标签外文本):实体解码 + 符号校对,且默认关闭中文专用规则。 + */ + public static function normalizeEnglishAbstractHtml($html, array $options = []) + { + return self::normalizeAbstractHtml($html, array_merge(['english_journal' => true], $options)); + } +} diff --git a/application/common/PlagiarismService.php b/application/common/PlagiarismService.php index cd48839d..a067a3de 100644 --- a/application/common/PlagiarismService.php +++ b/application/common/PlagiarismService.php @@ -34,6 +34,13 @@ class PlagiarismService */ const MAX_POLL_ATTEMPTS = 60; + private $logFile; + + public function __construct() + { + $this->logFile = ROOT_PATH . 'runtime' . DS . 'plagiarism_task.log'; + } + // ---------- 顶层入口 ---------- /** @@ -55,6 +62,7 @@ class PlagiarismService ->where('article_id', $articleId) ->value('journal_id'); + $now = time(); $checkId = Db::name('plagiarism_check')->insertGetId([ 'article_id' => $articleId, @@ -67,12 +75,12 @@ class PlagiarismService 'ctime' => $now, 'utime' => $now, ]); - + $this->log("submit service act"); // 入队执行:上传 + 触发 similarity Queue::push( 'app\\api\\job\\PlagiarismRun', ['check_id' => $checkId, 'file_path' => $filePath], - 'plagiarism' + 'PlagiarismRun' ); return (int)$checkId; @@ -84,7 +92,7 @@ class PlagiarismService public function runUploadAndTrigger($checkId, $filePath) { $check = $this->mustGetCheck($checkId); - + $this->log("runUploadAndTrigger is act0"); try { $tii = new TurnitinService(); @@ -95,7 +103,7 @@ class PlagiarismService if ($articleTitle === '') { $articleTitle = 'Article #' . $check['article_id']; } - + $this->log("runUploadAndTrigger is act1"); $createResp = $tii->createSubmission([ 'title' => mb_substr($articleTitle, 0, 250), 'owner' => 'editor_' . $check['triggered_by'], @@ -114,7 +122,7 @@ class PlagiarismService 'tii_submission_id' => $submissionId, 'raw_response' => json_encode($createResp, JSON_UNESCAPED_UNICODE), ]); - + $this->log("runUploadAndTrigger is act2"); // 2. 上传文件 $tii->uploadFile($submissionId, $filePath, basename($filePath)); @@ -127,12 +135,14 @@ class PlagiarismService 'raw_response' => json_encode($simResp, JSON_UNESCAPED_UNICODE), ]); + $this->log("runUploadAndTrigger is act3"); + // 4. 排队首次轮询(晚一点开始,让 Turnitin 先处理) Queue::later( self::POLL_INTERVAL, 'app\\api\\job\\PlagiarismPoll', ['check_id' => $checkId, 'attempt' => 1], - 'plagiarism' + 'PlagiarismPoll' ); } catch (\Throwable $e) { $this->markFailed($checkId, '[upload] ' . $e->getMessage()); @@ -320,6 +330,7 @@ class PlagiarismService private function markFailed($checkId, $errMsg) { + $this->log("markFailed act"); $this->updateCheck($checkId, [ 'state' => 4, 'error_msg' => mb_substr($errMsg, 0, 1000), @@ -337,7 +348,7 @@ class PlagiarismService $row = Db::name('article_file') ->where('article_id', $articleId) ->where('type_name', 'manuscirpt') // 历史拼写 - ->order('article_file_id desc') + ->order('file_id desc') ->find(); if (!$row || empty($row['file_url'])) { throw new Exception("article #{$articleId} has no manuscirpt file"); @@ -420,4 +431,10 @@ class PlagiarismService { return Db::name('plagiarism_check')->where('check_id', $checkId)->find(); } + + public function log($msg) + { + $line = date('Y-m-d H:i:s') . ' ' . $msg . PHP_EOL; + @file_put_contents($this->logFile, $line, FILE_APPEND); + } }