修改自动推广的相关任务

2026-05-13 12:26:28 +08:00
parent c36eba77b1
commit fa878334cd
7 changed files with 289 additions and 29 deletions
--- a/application/common/ArticleSymbolNormalizer.php
+++ b/application/common/ArticleSymbolNormalizer.php
@@ -0,0 +1,194 @@
+<?php
+
+namespace app\common;
+
+/**
+ * Symbol-level proofreading for journal article content: only punctuation, whitespace and
+ * full-width/half-width forms are adjusted; no semantic text is added or removed.
+ *
+ * Design notes:
+ * - The default rules are conservative and each one can be switched off through $options.
+ * - Use normalize() for plain text and normalizeHtml() for HTML (only the text between tags
+ *   is processed, so URLs inside attributes are left alone).
+ * - Abstracts are often stored escaped (&gt; &lt; &amp; ...); normalizeAbstract() decodes the
+ *   entities first and then applies the symbol rules.
+ * - English-journal text normally contains no Chinese: pass english_journal=true (or call
+ *   normalizeEnglishAbstract()) to switch off the rules that only target Han characters.
+ */
+class ArticleSymbolNormalizer
+{
+    /** @var string PCRE character-class body for Han characters (U+4E00-U+9FFF and U+3400-U+4DBF) */
+    private static $han = '\x{4E00}-\x{9FFF}\x{3400}-\x{4DBF}';
+
+    /**
+     * Symbol-level proofreading of plain text.
+     *
+     * Regex-based rules that fail at runtime (for example because the input is not valid
+     * UTF-8) are skipped and leave the text as it was, so the method always returns a string.
+     *
+     * @param string $text
+     * @param array  $options Optional boolean switches:
+     *   - line_endings         (default true)  CRLF / CR -> LF
+     *   - fullwidth_space      (default true)  U+3000 ideographic space -> ASCII space
+     *   - collapse_spaces      (default true)  runs of two or more spaces/tabs (not newlines) -> one space
+     *   - remove_zwsp          (default true)  drop U+200B-U+200D, U+FEFF and U+00AD
+     *   - comma_cjk            (default true)  ASCII "," between two Han characters -> "，"
+     *   - comma_latin          (default true)  "，" between two ASCII letters/digits -> ","
+     *   - period_cjk           (default true)  "．" (U+FF0E) right after a Han character -> "。"
+     *   - bracket_latin        (default false) "（…）" -> "(…)" when the content consists only of
+     *                                          ASCII letters, digits, whitespace and . , ; : - + / =
+     *   - decode_html_entities (default false) decode HTML entities before the other rules
+     *   - english_journal      (default false) turns comma_cjk and period_cjk off unless the
+     *                                          caller passed those keys explicitly
+     *
+     * @return string
+     */
+    public static function normalize($text, array $options = [])
+    {
+        $text = (string)$text;
+        if ($text === '') {
+            return '';
+        }
+
+        $o = array_merge([
+            'line_endings'         => true,
+            'fullwidth_space'      => true,
+            'collapse_spaces'      => true,
+            'remove_zwsp'          => true,
+            'comma_cjk'            => true,
+            'comma_latin'          => true,
+            'period_cjk'           => true,
+            'bracket_latin'        => false,
+            'decode_html_entities' => false,
+            'english_journal'      => false,
+        ], $options);
+
+        // English-journal mode only changes the defaults: a Han-specific rule the caller
+        // listed explicitly keeps whatever value the caller gave it.
+        if (!empty($o['english_journal'])) {
+            foreach (['comma_cjk', 'period_cjk'] as $hanOnlyRule) {
+                if (!array_key_exists($hanOnlyRule, $options)) {
+                    $o[$hanOnlyRule] = false;
+                }
+            }
+        }
+
+        if (!empty($o['decode_html_entities'])) {
+            $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
+        }
+
+        if (!empty($o['line_endings'])) {
+            $text = str_replace(["\r\n", "\r"], "\n", $text);
+        }
+        if (!empty($o['fullwidth_space'])) {
+            $text = str_replace("\u{3000}", ' ', $text);
+        }
+        if (!empty($o['remove_zwsp'])) {
+            // Zero-width space / non-joiner / joiner, BOM and soft hyphen.
+            $text = self::replaceOrKeep('/[\x{200B}-\x{200D}\x{FEFF}\x{00AD}]/u', '', $text);
+        }
+        if (!empty($o['collapse_spaces'])) {
+            $text = self::replaceOrKeep('/[ \t]{2,}/u', ' ', $text);
+        }
+
+        $han = self::$han;
+
+        if (!empty($o['comma_cjk'])) {
+            // Han , Han -> Han ， Han
+            $text = self::replaceOrKeep('/(?<=[' . $han . ']),(?=[' . $han . '])/u', '，', $text);
+        }
+        if (!empty($o['comma_latin'])) {
+            // letter/digit ， letter/digit -> ,
+            $text = self::replaceOrKeep('/(?<=[0-9A-Za-z])，(?=[0-9A-Za-z])/u', ',', $text);
+        }
+        if (!empty($o['period_cjk'])) {
+            // Full-width full stop U+FF0E after a Han character -> ideographic full stop
+            $text = self::replaceOrKeep('/(?<=[' . $han . '])．/u', '。', $text);
+        }
+        if (!empty($o['bracket_latin'])) {
+            // The character class contains "/", so "~" is used as the pattern delimiter;
+            // with "/" as delimiter the pattern does not compile and the whole text is lost.
+            $text = self::replaceOrKeep('~（([0-9A-Za-z\s\.,;:\-\+/=]+)）~u', '($1)', $text);
+        }
+
+        return $text;
+    }
+
+    /**
+     * Runs preg_replace() and falls back to the untouched subject when PCRE reports an error.
+     *
+     * preg_replace() returns null on failure (e.g. malformed UTF-8 with the "u" modifier);
+     * assigning that result back would silently wipe the text being proofread.
+     *
+     * @param string $pattern
+     * @param string $replacement
+     * @param string $subject
+     * @return string
+     */
+    private static function replaceOrKeep($pattern, $replacement, $subject)
+    {
+        $result = preg_replace($pattern, $replacement, $subject);
+
+        return $result === null ? $subject : $result;
+    }
+
+    /**
+     * Symbol-level proofreading of an HTML fragment: only text outside of tags is changed,
+     * tag names and attribute values are copied through untouched.
+     *
+     * Implementation: the input is split on `<...>`; even-indexed pieces are text and go
+     * through normalize(), odd-indexed pieces are the tags themselves and are kept verbatim.
+     * Caveats: malformed HTML, or attribute values containing an unescaped `>`, can be split
+     * in the wrong place; for complex input extract the plain text first.
+     * NOTE(review): the content of <script>/<style> elements sits between tags and is therefore
+     * treated as text as well.
+     *
+     * @param string $html
+     * @param array  $options Same as normalize()
+     * @return string
+     */
+    public static function normalizeHtml($html, array $options = [])
+    {
+        $html = (string)$html;
+        if ($html === '') {
+            return '';
+        }
+
+        // No "u" modifier: the pattern is pure ASCII and the bytes "<" / ">" never occur inside
+        // a multi-byte UTF-8 sequence, so byte-wise splitting is safe and keeps working when the
+        // input contains malformed UTF-8.
+        $parts = preg_split('/(<[^>]*>)/', $html, -1, PREG_SPLIT_DELIM_CAPTURE);
+        if ($parts === false) {
+            // Tags could not be isolated; returning the input unchanged is safer than running
+            // the text rules over markup and attribute values.
+            return $html;
+        }
+
+        $out = '';
+        foreach ($parts as $i => $chunk) {
+            if ($chunk === '') {
+                continue;
+            }
+            // With PREG_SPLIT_DELIM_CAPTURE and a single capture group, odd indexes always
+            // hold the captured tag and even indexes the text around it.
+            $out .= ($i % 2 === 1) ? $chunk : self::normalize($chunk, $options);
+        }
+
+        return $out;
+    }
+
+    /**
+     * Abstract helper: decode HTML entities (&gt; -> > etc.) and then apply the same symbol
+     * rules as normalize().
+     *
+     * Intended for abstracts that are stored or transported in htmlspecialchars() form.
+     * If the abstract contains real HTML tags whose structure must be preserved, use
+     * normalizeHtml() / normalizeAbstractHtml() instead.
+     *
+     * @param string $abstract
+     * @param array  $options Same as normalize(); decode_html_entities defaults to true here
+     *                        and can be overridden with an explicit false
+     * @return string
+     */
+    public static function normalizeAbstract($abstract, array $options = [])
+    {
+        $opts = array_merge(['decode_html_entities' => true], $options);
+
+        return self::normalize($abstract, $opts);
+    }
+
+    /**
+     * Abstract containing HTML tags: entity decoding and symbol rules are applied to the text
+     * outside of tags only; tags and attributes are not modified.
+     *
+     * NOTE(review): decoding turns &lt; / &gt; in text segments into literal "<" / ">" in a
+     * result that still contains markup; confirm this is intended before rendering the
+     * result as HTML.
+     *
+     * @param string $html
+     * @param array  $options Same as normalize(); decode_html_entities defaults to true
+     * @return string
+     */
+    public static function normalizeAbstractHtml($html, array $options = [])
+    {
+        $opts = array_merge(['decode_html_entities' => true], $options);
+
+        return self::normalizeHtml($html, $opts);
+    }
+
+    /**
+     * English-journal abstract: entity decoding plus symbol rules, with the Han-specific
+     * punctuation rules switched off by default.
+     *
+     * @param string $abstract
+     * @param array  $options Same as normalize(); english_journal defaults to true
+     * @return string
+     */
+    public static function normalizeEnglishAbstract($abstract, array $options = [])
+    {
+        return self::normalizeAbstract($abstract, array_merge(['english_journal' => true], $options));
+    }
+
+    /**
+     * English-journal abstract containing HTML (text outside of tags only): entity decoding
+     * plus symbol rules, with the Han-specific rules switched off by default.
+     *
+     * @param string $html
+     * @param array  $options Same as normalize(); english_journal defaults to true
+     * @return string
+     */
+    public static function normalizeEnglishAbstractHtml($html, array $options = [])
+    {
+        return self::normalizeAbstractHtml($html, array_merge(['english_journal' => true], $options));
+    }
+}