tougao/application/common/ArticleSymbolNormalizer.php

<?php

namespace app\common;

/**
 * Symbol-level proofreading for journal article text: only punctuation, whitespace and
 * full-width/half-width forms are adjusted; no words are added or removed.
 *
 * Design notes:
 * - The rules are conservative and each one can be switched on/off through $options.
 * - Use normalize() for plain text and normalizeHtml() for HTML (only the text between
 *   tags is processed, so URLs inside attributes are left untouched).
 * - Abstracts are often stored escaped (&gt; &lt; &amp; ...): normalizeAbstract() decodes
 *   the entities first and then applies the symbol rules.
 * - English journals normally contain no Chinese text: pass english_journal=true (or call
 *   normalizeEnglishAbstract()) to switch off the rules that only target Han characters.
 * - If a regex rule fails at runtime (e.g. the input is not valid UTF-8, which makes every
 *   /u pattern fail), that rule is skipped and the text is kept as it was, instead of the
 *   whole text being replaced by null.
 */
class ArticleSymbolNormalizer
{
    /**
     * @var string Character-class fragment for Han characters: CJK Unified Ideographs
     *             (U+4E00-U+9FFF) and Extension A (U+3400-U+4DBF). Only usable inside
     *             a [...] class of a pattern compiled with the /u modifier.
     */
    private static $han = '\x{4E00}-\x{9FFF}\x{3400}-\x{4DBF}';

    /**
     * Symbol-level normalization of plain text.
     *
     * @param string $text    Input text; it is cast to string, and '' is returned for empty input.
     * @param array  $options Optional boolean switches:
     *   - line_endings         (default true)  CRLF / CR -> LF
     *   - fullwidth_space      (default true)  U+3000 ideographic space -> ASCII space
     *   - collapse_spaces      (default true)  runs of 2+ spaces/tabs (never newlines) -> one space
     *   - remove_zwsp          (default true)  delete U+200B, U+200C, U+200D, U+FEFF and U+00AD
     *   - comma_cjk            (default true)  ASCII "," between two Han characters -> full-width comma
     *   - comma_latin          (default true)  full-width comma between two ASCII letters/digits -> ","
     *   - period_cjk           (default true)  full-width full stop U+FF0E after a Han character -> U+3002
     *   - bracket_latin        (default false) full-width parentheses -> "()" when the content consists
     *                                          only of ASCII letters, digits, whitespace and . , ; : - + / =
     *   - decode_html_entities (default false) decode &gt; &lt; &amp; &quot; &#39;, numeric entities, etc.
     *                                          before any other rule (see normalizeAbstract())
     *   - english_journal      (default false) turns comma_cjk and period_cjk off unless the caller
     *                                          passed those keys explicitly (see normalizeEnglishAbstract())
     *
     * @return string The normalized text.
     */
    public static function normalize($text, array $options = [])
    {
        $text = (string)$text;
        if ($text === '') {
            return '';
        }

        $o = array_merge([
            'line_endings'         => true,
            'fullwidth_space'      => true,
            'collapse_spaces'      => true,
            'remove_zwsp'          => true,
            'comma_cjk'            => true,
            'comma_latin'          => true,
            'period_cjk'           => true,
            'bracket_latin'        => false,
            'decode_html_entities' => false,
            'english_journal'      => false,
        ], $options);

        // english_journal only changes the *defaults* of the Han-specific rules:
        // a value the caller passed explicitly always wins.
        if (!empty($o['english_journal'])) {
            foreach (['comma_cjk', 'period_cjk'] as $cjkRule) {
                if (!array_key_exists($cjkRule, $options)) {
                    $o[$cjkRule] = false;
                }
            }
        }

        // Decode first so that the symbol rules below see the real characters.
        if (!empty($o['decode_html_entities'])) {
            $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
        }

        if (!empty($o['line_endings'])) {
            $text = str_replace(["\r\n", "\r"], "\n", $text);
        }
        if (!empty($o['fullwidth_space'])) {
            $text = str_replace("\u{3000}", ' ', $text);
        }
        if (!empty($o['remove_zwsp'])) {
            // Zero-width space / non-joiner / joiner, BOM (zero-width no-break space), soft hyphen.
            $text = self::replaceOrKeep('/[\x{200B}-\x{200D}\x{FEFF}\x{00AD}]/u', '', $text);
        }
        if (!empty($o['collapse_spaces'])) {
            // Runs after fullwidth_space so converted U+3000 characters are collapsed as well.
            $text = self::replaceOrKeep('/[ \t]{2,}/u', ' ', $text);
        }

        $han = self::$han;

        if (!empty($o['comma_cjk'])) {
            // Han , Han -> Han full-width-comma Han
            $text = self::replaceOrKeep('/(?<=[' . $han . ']),(?=[' . $han . '])/u', '，', $text);
        }
        if (!empty($o['comma_latin'])) {
            // letter/digit full-width-comma letter/digit -> ASCII comma
            $text = self::replaceOrKeep('/(?<=[0-9A-Za-z])，(?=[0-9A-Za-z])/u', ',', $text);
        }
        if (!empty($o['period_cjk'])) {
            // Full-width full stop (U+FF0E) right after a Han character -> ideographic full stop.
            $text = self::replaceOrKeep('/(?<=[' . $han . '])．/u', '。', $text);
        }
        if (!empty($o['bracket_latin'])) {
            // The "/" inside the character class must be escaped because "/" is also the
            // pattern delimiter; unescaped it terminates the pattern early and the call fails.
            $text = self::replaceOrKeep('/（([0-9A-Za-z\s.,;:\-+\/=]+)）/u', '($1)', $text);
        }

        return $text;
    }

    /**
     * Normalize an HTML fragment: only text outside of tags is rewritten; tag names and
     * attribute values are copied through unchanged.
     *
     * Implementation: the input is split on `<...>` with the delimiters captured, so even
     * indexes hold text (passed to normalize()) and odd indexes hold tags (kept verbatim).
     * Each text segment is normalized on its own, so rules that look at neighbouring
     * characters do not see across a tag boundary.
     *
     * Caveat: this is a regex split, not an HTML parser. Malformed markup or an attribute
     * value containing an unescaped `>` can be mis-split; for such input extract the plain
     * text first.
     *
     * @param string $html
     * @param array  $options Same as normalize().
     * @return string
     */
    public static function normalizeHtml($html, array $options = [])
    {
        $html = (string)$html;
        if ($html === '') {
            return '';
        }

        $parts = preg_split('/(<[^>]*>)/u', $html, -1, PREG_SPLIT_DELIM_CAPTURE);
        if ($parts === false) {
            // The split failed (e.g. invalid UTF-8): fall back to treating everything as text.
            return self::normalize($html, $options);
        }

        $out = '';
        foreach ($parts as $i => $chunk) {
            if ($chunk === '') {
                continue;
            }
            $isTag = ($i % 2 === 1) && $chunk[0] === '<';
            $out .= $isTag ? $chunk : self::normalize($chunk, $options);
        }

        return $out;
    }

    /**
     * Abstract helper: decode HTML entities (&gt; -> > etc.) and then apply the same symbol
     * rules as normalize().
     *
     * Intended for abstracts that are stored or transported in htmlspecialchars() form.
     * If the abstract contains real HTML tags whose structure must be kept, use
     * normalizeAbstractHtml() (or normalizeHtml() with decode_html_entities) instead.
     *
     * @param string $abstract
     * @param array  $options Same as normalize(); decode_html_entities defaults to true here
     *                        and can be overridden with an explicit false.
     * @return string
     */
    public static function normalizeAbstract($abstract, array $options = [])
    {
        $opts = array_merge(['decode_html_entities' => true], $options);
        return self::normalize($abstract, $opts);
    }

    /**
     * Abstract containing HTML tags: entity decoding and symbol rules are applied only to
     * the text outside of tags; tags and attributes are left untouched.
     *
     * Note: decoded text segments may contain literal `<`, `>` or `&`, so the return value
     * must be re-escaped before it is emitted as HTML again.
     *
     * @param string $html
     * @param array  $options Same as normalize(); decode_html_entities defaults to true.
     * @return string
     */
    public static function normalizeAbstractHtml($html, array $options = [])
    {
        $opts = array_merge(['decode_html_entities' => true], $options);
        return self::normalizeHtml($html, $opts);
    }

    /**
     * English-journal abstract: entity decoding plus symbol rules, with the Han-specific
     * punctuation rules off by default.
     *
     * @param string $abstract
     * @param array  $options Same as normalize(); english_journal defaults to true here.
     * @return string
     */
    public static function normalizeEnglishAbstract($abstract, array $options = [])
    {
        return self::normalizeAbstract($abstract, array_merge(['english_journal' => true], $options));
    }

    /**
     * English-journal abstract containing HTML (text outside tags only): entity decoding
     * plus symbol rules, with the Han-specific punctuation rules off by default.
     *
     * @param string $html
     * @param array  $options Same as normalize(); english_journal defaults to true here.
     * @return string
     */
    public static function normalizeEnglishAbstractHtml($html, array $options = [])
    {
        return self::normalizeAbstractHtml($html, array_merge(['english_journal' => true], $options));
    }

    /**
     * preg_replace() wrapper that never loses the subject: when PCRE reports an error
     * (preg_replace() returns null, e.g. for malformed UTF-8 with the /u modifier) the
     * original text is returned unchanged.
     *
     * @param string $pattern
     * @param string $replacement
     * @param string $text
     * @return string
     */
    private static function replaceOrKeep($pattern, $replacement, $text)
    {
        $result = preg_replace($pattern, $replacement, $text);
        return $result === null ? $text : $result;
    }
}