From 1e2e8146cc6cece6c1af51b9abc8e06b154b536b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=8B=E4=BA=8E=E5=88=9D=E8=A7=81?= <752204717@qq.com> Date: Thu, 16 Apr 2026 11:02:01 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=A1=E7=A8=BF=E6=A0=A1=E9=AA=8C=E7=A9=BA?= =?UTF-8?q?=E6=A0=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/common/js/commonJS.js | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/src/common/js/commonJS.js b/src/common/js/commonJS.js index eae4459..3f15913 100644 --- a/src/common/js/commonJS.js +++ b/src/common/js/commonJS.js @@ -1246,35 +1246,28 @@ str = str.replace(regex, function (match, content, offset, fullString) { getCleanTextForCount(html) { if (!html) return ""; - - // 创建临时容器解析 HTML const tempDiv = document.createElement('div'); tempDiv.innerHTML = html; - // A. 特殊处理 wmath:只拿它里面的公式文本,扔掉里面生成的 mjx-container 等标签 + // 1. 处理公式 (保留公式文本内容) const wmaths = tempDiv.querySelectorAll('wmath'); wmaths.forEach(wm => { - // textContent 会拿到最原始的公式字符,忽略内部所有标签 - const textNode = document.createTextNode(wm.textContent); + const textNode = document.createTextNode(" " + wm.textContent + " "); wm.parentNode.replaceChild(textNode, wm); }); - // B. 获取现在的 HTML 内容 - let result = tempDiv.innerHTML; + // 2. 移除所有引用标签 [1], [2] (防止用户靠狂刷引用来凑字数) + const refs = tempDiv.querySelectorAll('span.reference-link, a.ref'); // 根据你系统的 class 名调整 + refs.forEach(r => r.remove()); - // C. 去掉特定排版标签的“壳”(保留里面的文字) - // 包含 b, strong, br, em, i, sup, sub 等 - result = result.replace(/<(p|div|b|strong|br|em|i|sup|sub)[^>]*>/gi, ""); - result = result.replace(/<\/(p|div|b|strong|br|em|i|sup|sub)>/gi, ""); + // 3. 获取纯文本 (textContent 是浏览器原生方法,能处理所有标签及其属性) + let text = tempDiv.textContent || tempDiv.innerText || ""; - - - // E. 彻底“脱水”:去掉 HTML 实体、换行符、所有空格 - result = result.replace(/ /ig, ""); - result = result.replace(/[\r\n\t]/g, ""); - result = result.replace(/\s+/g, ""); - - return result; + // 4. 标准化空格:将 HTML 实体、换行、多个空格统一转为一个空格 + return text.replace(/ /ig, " ") + .replace(/[\r\n\t]+/g, " ") + .replace(/\s+/g, " ") + .trim(); },