diff --git a/application/api/controller/Article.php b/application/api/controller/Article.php index fbdac98..8cc1610 100644 --- a/application/api/controller/Article.php +++ b/application/api/controller/Article.php @@ -95,6 +95,7 @@ class Article extends Base if ($data['name'] != '') { $where['t_article.title'] = array('like', "%" . $data['name'] . "%"); } + //分页查询数据 $limit_start = ($data['pageIndex'] - 1) * $data['pageSize']; $res = $this->article_obj->field('t_article.*,t_journal.title journalname') @@ -103,6 +104,7 @@ class Article extends Base ->order('article_id desc') ->limit($limit_start, $data['pageSize'])->select(); $count = $this->article_obj->where($where)->count(); + foreach ($res as $key => $val) { //查询建议转投详情 $transfer_info = $this->article_transfer_obj @@ -499,7 +501,7 @@ class Article extends Base $data = $this->request->post(); //查询文章基础数据 $where['t_article.article_id'] = $data['articleId']; - $article_res = $this->article_obj->field('t_article.*,t_journal.title journalname,t_user.account')->join(array(['t_journal', 't_journal.journal_id = t_article.journal_id', 'LEFT'], ['t_user', 't_user.user_id = t_article.user_id', 'LEFT']))->where($where)->find(); + $article_res = $this->article_obj->field('t_article.*,t_journal.title journalname,t_user.account,t_user.email as user_email')->join(array(['t_journal', 't_journal.journal_id = t_article.journal_id', 'LEFT'], ['t_user', 't_user.user_id = t_article.user_id', 'LEFT']))->where($where)->find(); //查询文章状态跟踪信息 $article_msg = $this->article_msg_obj->where(['article_id' => $data['articleId']])->where('state', 0)->select(); //查询审稿人审稿建议 @@ -1586,15 +1588,10 @@ class Article extends Base $this->addProductionEx($data['articleId']); $this->addArticleMainEx($data['articleId']); - //处理缴费相关信息 - if($journal_info['fee']==0){ + //如果是免费的期刊文章,那么直接变成付款完成 + if ($journal_info['fee'] == 0 || $article_info['ctime'] < 1735660800) { $this->article_obj->where("article_id", $article_info['article_id'])->update(["is_buy" => 1]); - }else{ - $this->article_obj->where("article_id", $article_info['article_id'])->update(["fee" => $journal_info['fee']]); } -// if ($journal_info['fee'] == 0 || $article_info['ctime'] < 1735660800) { -// $this->article_obj->where("article_id", $article_info['article_id'])->update(["is_buy" => 1]); -// } } } diff --git a/application/api/controller/Crontask.php b/application/api/controller/Crontask.php index 4e09a0a..dc7ba4c 100644 --- a/application/api/controller/Crontask.php +++ b/application/api/controller/Crontask.php @@ -2,9 +2,11 @@ namespace app\api\controller; use think\Controller; use think\Db; +use app\common\Reviewer; class Crontask extends Controller { + protected $iChunkSize = 500; /** * 批量处理审稿人审稿质量 * @return void @@ -144,7 +146,8 @@ class Crontask extends Controller //获取该文章审核人的信息 $aWhere = [ 'ctime'=>['>',$sDate], - 'state'=>['in',[1,2,3]] + // 'state'=>['in',[1,2,3]] + 'state' => ['BETWEEN',[1,3]] ]; $aReviewer = Db::name('article_reviewer')->field('reviewer_id,count(article_id) as review_num ')->where($aWhere)->order('reviewer_id asc')->group('reviewer_id')->select(); @@ -159,8 +162,10 @@ class Crontask extends Controller exit; } $aUser = empty($aUser) ? [] : array_column($aUser, null,'user_id'); + //分片处理数量 + $iChunkSize = 200; if(!empty($aReviewer)){ - $aChunk = array_chunk($aReviewer, 100); + $aChunk = array_chunk($aReviewer, $iChunkSize); Db::startTrans(); foreach ($aChunk as $key => $item) { //数据分片操作 //需要更新的用户ID @@ -174,8 +179,7 @@ class Crontask extends Controller //拼接更新语句 if(empty($aUser[$iUserId])){ //更新数量 - $aCase['review_num'] .= "WHEN {$iUserId} THEN "; - $aCase['review_num'] .= "'{$value['review_num']}' "; + $aCase['review_num'] .= "WHEN {$iUserId} THEN '{$value['review_num']}' "; $aUpdateId[] = $iUserId; continue; } @@ -185,8 +189,7 @@ class Crontask extends Controller continue; } //审核数量有,变化更新数量 - $aCase['review_num'] .= "WHEN {$iUserId} THEN "; - $aCase['review_num'] .= "'{$value['review_num']}' "; + $aCase['review_num'] .= "WHEN {$iUserId} THEN '{$value['review_num']}' "; $aUpdateId[] = $iUserId; unset($aUser[$iUserId]); } @@ -198,6 +201,7 @@ class Crontask extends Controller } $result = Db::name('user') ->where(['user_id' => ['in',$aUpdateId]]) + ->limit(count($aUpdateId)) ->update([ 'review_num' => Db::raw($aCase['review_num']), ]); @@ -210,14 +214,14 @@ class Crontask extends Controller Db::commit(); } if(!empty($aUser)){ - $aChunk = array_chunk($aUser, 100); + $aChunk = array_chunk($aUser, $iChunkSize); Db::startTrans(); foreach ($aChunk as $key => $item) { //数据分片操作 $aUserId = array_column($item, 'user_id'); if(empty($aUserId)){ continue; } - $result = Db::name('user')->where(['is_reviewer' => 1,'user_id' => ['in',$aUserId]]) + $result = Db::name('user')->where(['user_id' => ['in',$aUserId]]) ->limit(count($aUserId)) ->update([ 'review_num' => 0, @@ -235,11 +239,11 @@ class Crontask extends Controller } /** - * @title 审稿人拒绝审稿[超过七日默认自动拒绝审稿] + * @title 审稿人拒绝审稿[超过5日默认自动拒绝审稿] * */ public function refuseReviewArticle(){ - $sDate = strtotime(date('Y-m-d 00:00:00', strtotime('-7 day'))); + $sDate = strtotime(date('Y-m-d 00:00:00', strtotime('-5 day'))); //获取该文章审核人的信息 $aWhere = [ 'ctime'=>['<',$sDate], @@ -252,11 +256,14 @@ class Crontask extends Controller $this->showMessage('未查询到需要超过七日的稿件',2); exit; } - + //分片处理数量 + $iChunkSize = $this->iChunkSize; + //扣减分值 + $iScore = 3; + //更新审稿人未审稿的数量 + $aChunkReviewerNum = array_chunk($aReviewerNum, $iChunkSize); //更新超过七日未审核的数据 Db::startTrans(); - //更新审稿人未审稿的数量 - $aChunkReviewerNum = array_chunk($aReviewerNum, 500); foreach ($aChunkReviewerNum as $key => $value) { $aId = array_column($value, 'reviewer_id'); if(empty($aId)){ @@ -270,26 +277,38 @@ class Crontask extends Controller }else{ $this->showMessage('更新审稿人审稿状态成功['.$key.']执行SQL条数:'.$iResult."\n",1); } - $aCase = $aUpdateId = []; - $sRdNum = ''; + $aUpdateId = []; + $aCase = ['rd_num' => '','review_score' => '']; foreach ($value as $key => $item) { if($item['reviewer_id'] <=0){ continue; } //审核数量有,变化更新数量 - $sRdNum .= "WHEN {$item['reviewer_id']} THEN "; - $sRdNum .= Db::raw("rd_num + {$item['num']}")." "; + $aCase['rd_num'] .= "WHEN {$item['reviewer_id']} THEN "; + $aCase['rd_num'] .= Db::raw("rd_num + {$item['num']}")." "; + + //扣减分数 + $aCase['review_score'] .= "WHEN {$item['reviewer_id']} THEN "; + $aCase['review_score'] .= Db::raw("review_score - {$iScore}")." "; + $aUpdateId[] = $item['reviewer_id']; } - //SQL拼接最后结尾 - $aCase['rd_num'] ='CASE user_id '.$sRdNum.'END'; + //更新数据库 + foreach ($aCase as $kk => $value) { + if(empty($value)){ + continue; + } + $aUpdateCase[$kk] = Db::raw('CASE user_id '.$value.'END'); + } + if(empty($aUpdateCase) || empty($aId)){ + $this->showMessage('未查询到满足要求的审稿人数据['.$key.']'."\n",2); + continue; + } //执行更新 $result = Db::name('user') ->where(['user_id' => ['in',$aUpdateId]]) ->limit(count($aUpdateId)) - ->update([ - 'rd_num' => Db::raw($aCase['rd_num']), - ]); + ->update($aUpdateCase); if ($result === false) { $this->showMessage('更新用户拒绝审稿数量失败['.$key.']执行SQL:'.Db::getLastSql()."\n",2); }else{ @@ -333,8 +352,7 @@ class Crontask extends Controller continue; } //审核数量有,变化更新数量 - $sRdNum .= "WHEN {$item['reviewer_id']} THEN "; - $sRdNum .= "{$item['num']} "; + $sRdNum .= "WHEN {$item['reviewer_id']} THEN {$item['num']} "; $aUpdateId[] = $item['reviewer_id']; } //SQL拼接最后结尾 @@ -360,7 +378,7 @@ class Crontask extends Controller /** - * 批量处理审稿人活跃度[近两年] + * 批量处理审稿人活跃度/同意审稿数量[近两年] * * @return void */ @@ -368,64 +386,98 @@ class Crontask extends Controller $sDate = strtotime(date('Y-m-d 00:00:00', strtotime('-2 year'))); //获取该文章审核人的信息 $aWhere = [ - 'ctime'=>['>',$sDate], - 'state'=>['in',[1,2,3]] + 'ctime' => ['>',$sDate], + 'state' => ['BETWEEN',[0,5]] ]; - $aReviewer = Db::name('article_reviewer')->field('reviewer_id,count(article_id) as review_num_two_year')->where($aWhere)->order('reviewer_id asc')->group('reviewer_id')->select(); + $aReviewer = Db::name('article_reviewer')->field('reviewer_id,SUM(state IN (0,1,2,3)) AS review_agree_num,SUM(state IN (1,2,3)) AS review_activity_num,SUM(state = 4) AS review_refuse_num,SUM(state IN (0,1,2,3,4,5)) AS review_invite_num')->where($aWhere)->order('reviewer_id asc')->group('reviewer_id')->select(); //查询审稿人数量不为0的审稿信息 - $aUserWhere = [ - // 'is_reviewer' => 1, - 'review_num_two_year' => ['>',0] - ]; - $aUser = Db::name('user')->field('user_id,review_num_two_year')->where($aUserWhere)->select(); + $aUser = Db::name('user')->field('user_id,review_activity_num,review_agree_num,review_refuse_num,review_invite_num')->where('review_invite_num','>',0)->select(); + if(empty($aReviewer) && empty($aUser)){ $this->showMessage('未查询到待处理的审稿人数据【近两年】',2); exit; } + + //处理数量 + $iChunkSize = $this->iChunkSize; $aUser = empty($aUser) ? [] : array_column($aUser, null,'user_id'); + //分片处理数据 if(!empty($aReviewer)){ - $aChunk = array_chunk($aReviewer, 100); + $aChunk = array_chunk($aReviewer, $iChunkSize); Db::startTrans(); foreach ($aChunk as $key => $item) { //数据分片操作 //需要更新的用户ID $aUpdateId = []; //SQL拼接 - $aCase['review_num_two_year'] = 'CASE user_id '; + $aCase = ['review_activity_num' => [], 'review_agree_num' => [],'review_refuse_num' => [],'review_invite_num' => [],'review_agree_rate' => []]; foreach ($item as $key => $value) { //用户ID $iUserId = $value['reviewer_id']; + //活跃度 + $review_activity_num = empty($value['review_activity_num']) ? 0 : $value['review_activity_num']; + //同意审稿数量 + $review_agree_num = empty($value['review_agree_num']) ? 0 : $value['review_agree_num']; + //拒绝审稿数量 + $review_refuse_num = empty($value['review_refuse_num']) ? 0 : $value['review_refuse_num']; + //邀请审稿数量 + $review_invite_num = empty($value['review_invite_num']) ? 0 : $value['review_invite_num']; + //同意审稿占比 + $review_agree_rate = empty($review_invite_num) ? 0 : round($review_agree_num/$review_invite_num,2); + //用户信息 + $aUserInfo = empty($aUser[$iUserId]) ? [] : $aUser[$iUserId]; //拼接更新语句 - if(empty($aUser[$iUserId])){ - //更新数量 - $aCase['review_num_two_year'] .= "WHEN {$iUserId} THEN "; - $aCase['review_num_two_year'] .= "'{$value['review_num_two_year']}' "; + if(empty($aUserInfo)){ + //更新活跃度-审稿数量 + $aCase['review_activity_num'][]= "WHEN {$iUserId} THEN '{$review_activity_num}' "; + //更新同意审稿数量 + $aCase['review_agree_num'][] = "WHEN {$iUserId} THEN '{$review_agree_num}' "; + //更新拒绝审稿数量 + $aCase['review_refuse_num'][] = "WHEN {$iUserId} THEN '{$review_refuse_num}' "; + //更新邀请审稿数量 + $aCase['review_invite_num'][] = "WHEN {$iUserId} THEN '{$review_invite_num}' "; + //同意审稿占比 + $aCase['review_agree_rate'][] = "WHEN {$iUserId} THEN '{$review_agree_rate}' "; + $aUpdateId[] = $iUserId; continue; } - //审核数量无变化,跳过更新 - if($aUser[$iUserId]['review_num_two_year'] == $value['review_num_two_year']){ + //数量无变化,跳过更新 + if($aUserInfo['review_activity_num'] == $value['review_activity_num'] && $aUserInfo['review_agree_num'] == $value['review_agree_num'] && $aUserInfo['review_refuse_num'] == $value['review_refuse_num'] && $aUserInfo['review_invite_num'] == $value['review_invite_num']){ unset($aUser[$iUserId]); continue; } - //审核数量有,变化更新数量 - $aCase['review_num_two_year'] .= "WHEN {$iUserId} THEN "; - $aCase['review_num_two_year'] .= "'{$value['review_num_two_year']}' "; + + //更新活跃度-审稿数量 + $aCase['review_activity_num'][] = "WHEN {$iUserId} THEN '{$review_activity_num}' "; + //更新同意审稿数量 + $aCase['review_agree_num'][] = "WHEN {$iUserId} THEN '{$review_agree_num}' "; + //更新拒绝审稿数量 + $aCase['review_refuse_num'][] = "WHEN {$iUserId} THEN '{$review_refuse_num}' "; + //更新邀请审稿数量 + $aCase['review_invite_num'][] = "WHEN {$iUserId} THEN '{$review_invite_num}' "; + //同意审稿占比 + $aCase['review_agree_rate'][] = "WHEN {$iUserId} THEN '{$review_agree_rate}' "; $aUpdateId[] = $iUserId; unset($aUser[$iUserId]); - } - //SQL拼接最后结尾 - $aCase['review_num_two_year'] .= 'END'; - //执行更新 - if(empty($aUpdateId)){ + } + //更新数据库 + foreach ($aCase as $kk => $value) { + if(empty($value)){ + continue; + } + $sWhere = implode(" ", $value); + $aUpdateCase[$kk] = Db::raw('CASE user_id '.$sWhere.'END'); + } + if(empty($aUpdateCase) || empty($aUpdateId)){ + $this->showMessage('未查询到满足要求的审稿人数据['.$key.']'."\n",2); continue; - } - $result = Db::name('user') - ->where(['user_id' => ['in',$aUpdateId]]) - ->update([ - 'review_num_two_year' => Db::raw($aCase['review_num_two_year']), - ]); + } + //更新数据 + $result = Db::name('user') + ->where(['user_id' => ['in',$aUpdateId]]) + ->update($aUpdateCase); if ($result === false) { $this->showMessage('更新近两年审稿人审核数量失败['.$key.']执行SQL:'.Db::getLastSql()."\n",2); }else{ @@ -435,7 +487,7 @@ class Crontask extends Controller Db::commit(); } if(!empty($aUser)){ - $aChunk = array_chunk($aUser, 100); + $aChunk = array_chunk($aUser, $iChunkSize); Db::startTrans(); foreach ($aChunk as $key => $item) { //数据分片操作 $aUserId = array_column($item, 'user_id'); @@ -445,7 +497,11 @@ class Crontask extends Controller $result = Db::name('user')->where(['user_id' => ['in',$aUserId]]) ->limit(count($aUserId)) ->update([ - 'review_num' => 0, + 'review_activity_num' => 0, + 'review_agree_num' => 0, + 'review_refuse_num' => 0, + 'review_refuse_num' => 0, + 'review_agree_rate' => 0, ]); if ($result === false) { $this->showMessage('清空近两年审稿人审核数量失败['.$key.']执行SQL:'.Db::getLastSql()."\n",2); @@ -458,13 +514,44 @@ class Crontask extends Controller } $this->showMessage('批量更新近两年审稿人审核数量成功'."\n",1); } + + /** + * 批量处理待审核的文章自动推荐审稿人 + * + * @return void + */ + public function recommendedReviewer(){ + + $this->showMessage('批量处理待审核的文章自动推荐审稿人成功'."\n",1); + exit; + //查询条件 + $aWhere = ['state' => 2]; + $aWhere['user_id'] = 54; + $aArticle = Db::name('article')->field('article_id,accept_sn')->where($aWhere)->limit(1)->select(); + if(empty($aArticle)){ + $this->showMessage('未查询到需要处理的待审核的文章',2); + exit; + } + //数据处理 + foreach ($aArticle as $key => $value) { + $iArticleId = empty($value['article_id']) ? 0 : $value['article_id']; + if(empty($iArticleId)){ + continue; + } + $sQueueId = \think\Queue::push('app\api\job\RecommendReviewer@fire',['article_id' =>$iArticleId], 'RecommendReviewer'); + if($sQueueId === false){ + $this->showMessage('文章入队失败,文章ID:'.$value['article_id'].'['.$value['accept_sn']."]\n",2); + continue; + } + } + $this->showMessage('批量处理待审核的文章自动推荐审稿人成功'."\n",1); + } /** * * 格式化信息输出 * * @access public * @return void - * @author huangpu * @date 2018.09.28 * @param $[message] [<显示信息>] * @param $[status] [<输出信息1成功,2失败>] @@ -477,4 +564,6 @@ class Crontask extends Controller } echo date("Y-m-d H:i:s") . " " . $message . "\n"; } + + } diff --git a/application/api/controller/Production.php b/application/api/controller/Production.php index de8f778..87e34f4 100644 --- a/application/api/controller/Production.php +++ b/application/api/controller/Production.php @@ -1854,11 +1854,6 @@ class Production extends Base $this->referToDoi($data['p_article_id']); $this->doiTofrag($data['p_article_id']); - //写入获取参考文献详情队列 chengxiaoling 20251127 start - if(!empty($data['p_article_id'])){ - \think\Queue::push('app\api\job\AiCheckRefer@fire',['p_article_id' => $data['p_article_id']],'AiCheckRefer'); - } - //写入获取参考文献详情队列 chengxiaoling 20251127 end return jsonSuccess([]); } @@ -1944,7 +1939,11 @@ class Production extends Base if ($v['refer_doi'] == '') { $this->production_article_refer_obj->where('p_refer_id', $v['p_refer_id'])->update(['refer_frag' => $v['refer_content']]); } else { - Queue::push('app\api\job\ts@fire1', $v, 'ts'); + + //修改队列兼容对接OPENAI接口 chengxiaoling 20251128 start + // Queue::push('app\api\job\ts@fire1', $v, 'ts'); + Queue::push('app\api\job\ArticleReferDetailQueue@fire', $v, 'ArticleReferDetailQueue'); + //修改队列兼容对接OPENAI接口 chengxiaoling 20251128 end } } return jsonSuccess([]); diff --git a/application/api/controller/Workbench.php b/application/api/controller/Workbench.php index 651f516..ec06353 100644 --- a/application/api/controller/Workbench.php +++ b/application/api/controller/Workbench.php @@ -463,4 +463,425 @@ class Workbench extends Base } return json_encode(['status' => 1,'msg' => 'success','data' => $aData]); } + /** + * 获取审稿权限 + * @param art_rev_id 审稿记录ID + * @param + */ + public function getReviewerAuth(){ + + //获取参数 + $aParam = empty($aParam) ? $this->request->post() : $aParam; + $aData = ['is_review_auth' => 2];//审稿权限1是2否 + //获取审稿记录ID + $iArtRevId = empty($aParam['art_rev_id']) ? 0 : $aParam['art_rev_id']; + if(empty($iArtRevId)){ + return json_encode(['status' => 2,'msg' => 'Please select a record','data' => $aData]); + } + //获取账号 + $sAccount = empty($aParam['account']) ? '' : $aParam['account']; + if(empty($sAccount)){ + return json_encode(['status' => 2,'msg' => 'Please enter your account','data' => $aData]); + } + //查询用户是否存在 + $aWhere = ['account' => $sAccount,'state' => 0]; + $aUser = Db::name('user')->field('user_id,account,email')->where($aWhere)->find(); + if(empty($aUser)){ + return json_encode(['status' => 3,'msg' => 'Account does not exist','data' => $aData]); + } + $iUserId = $aUser['user_id']; + + //查询审稿记录 + $aWhere = ['art_rev_id' => $iArtRevId]; + $aArticleReviewer = Db::name('article_reviewer')->where($aWhere)->find(); + if(empty($aArticleReviewer)){ + return json_encode(['status' => 4,'msg' => 'Review record does not exist','data' => $aData]); + } + $aData['review'] = $aArticleReviewer; + //获取文章信息 + $aWhere = ['article_id' => $aArticleReviewer['article_id']]; + $aArticle = Db::name('article')->field('article_id,abstrart,title article_title,type,accept_sn,journal_id,state')->where($aWhere)->find(); + if(empty($aArticle)){ + return json_encode(['status' => 5,'msg' => 'The article does not exist','data' => $aData]); + } + $aArticle['type_name'] = translateType($aArticle['type']);//文章类型 + //查询期刊信息 + $aWhere = ['journal_id' => $aArticle['article_id'],'state' => 0]; + $aJournal = Db::name('journal')->field('title as journal_name,website,email as journal_email')->find(); + if(!empty($aJournal)){ + $aArticle += $aJournal; + } + //判断是否有权限审稿 + $aData['article'] = $aArticle; + if($aArticleReviewer['reviewer_id'] != $iUserId){ + return json_encode(['status' => 6,'msg' => 'No review permission','data' => $aData]); + } + //判断审稿权限 + if($aArticle['state'] != 2){ + return json_encode(['status' => 7,'msg' => 'The article has not entered the review status','data' => $aData]); + } + //审稿拒绝 + if($aArticleReviewer['state'] == 2){ + //获取审稿答卷 + $aWhere = ['art_rev_id' => $iArtRevId,'state' => 0]; + $aQuestion = Db::name('article_reviewer_question')->field('art_rev_id,recommend')->where($aWhere)->find(); + if(empty($aQuestion)){ + return json_encode(['status' => 8,'msg' => 'You have declined the review','data' => $aData]); + } + } + //审稿已过期 + if($aArticleReviewer['state'] == 4){ + return json_encode(['status' => 13,'msg' => 'The review has expired','data' => $aData]); + } + + //同意/1小改后接收/接收 + //判断是否为邮件 + $iIsCode = 2;//是否邮件进入 + $sAct = empty($aParam['act']) ? '' : $aParam['act']; + $aWhere = ['code' => $sAct,'state' => 0]; + if(!empty($sAct)){ + //查询绑定的用户ID + $aCode = Db::name('login_auto')->field('user_id')->where($aWhere)->find(); + if(empty($aCode)){ + return json_encode(['status' => 10,'msg' => 'Code is illegal','data' => $aData]); + } + if($aCode['user_id'] != $iUserId){ + return json_encode(['status' => 11,'msg' => 'Illegal code operation','data' => $aData]); + } + $iIsCode = 1; + } + //当前时间 + $iNowTime = time(); + // 14天 = 14*24*3600 秒 = 1209600 秒 + $iFourteenDays = 14 * 24 * 3600; + //五天 + $iFiveDays = 5 * 24 * 3600; + //审稿邀请 + if($aArticleReviewer['state'] == 5){ + if($iIsCode == 1){//邮件进入未同意审稿直接同意 + //添加时间 + $iTime = empty($aArticleReviewer['ctime']) ? 0 : $aArticleReviewer['ctime']; + if (!is_numeric($iTime) || (int)$iTime <= 0) { + return json_encode([ + 'status' => 12, + 'msg' => 'Invalid record time, the review period has expired', + 'data' => $aData + ]); + } + //判断是否超过5天 + $timeDiff = $iTime+$iFiveDays; + if($timeDiff < $iNowTime){ + //执行审稿过期 + $aWhere = ['art_rev_id' => $iArtRevId,'state' => 5]; + $result = Db::name('article_reviewer')->where($aWhere)->limit(1)->update(['state' => 4]); + return json_encode(['status' => 13,'msg' => 'The number of days for agreeing to review has exceeded 5','data' => $aData]); + } + // var_dump(date('Y-m-d H:i:s',$timeDiff),date('Y-m-d H:i:s',$iTime),date('Y-m-d H:i:s',$iNowTime)); + //执行同意审稿 + $aWhere = ['art_rev_id' => $iArtRevId,'state' => 5]; + $result = Db::name('article_reviewer')->where($aWhere)->limit(1)->update(['state' => 0,'agree_review_time' => time()]); + } + if($iIsCode != 1){ + return json_encode(['status' => 14,'msg' => 'Reviewer did not agree to review','data' => $aData]); + } + } + //同意审稿 + if($aArticleReviewer['state'] == 0){ + //同意审稿的时间 + $iTime = empty($aArticleReviewer['agree_review_time']) ? 0 : $aArticleReviewer['agree_review_time']; + //添加时间 + $iCtime = empty($aArticleReviewer['ctime']) ? 0 : $aArticleReviewer['ctime']; + $iTime = empty($iTime) ? intval($iCtime) : intval($iTime); + if (!is_numeric($iTime) || (int)$iTime <= 0) { + return json_encode([ + 'status' => 15, + 'msg' => 'Invalid record time, the review period has expired', + 'data' => $aData + ]); + } + //判断是否超过14天 + $timeDiff = $iTime+$iFourteenDays; + if($timeDiff < $iNowTime){ + return json_encode(['status' => 16,'msg' => 'The number of days for agreeing to review has exceeded 14','data' => $aData]); + } + $aData['is_review_auth'] = 1; + return json_encode(['status' => 1,'msg' => 'success','data' => $aData]); + } + $aData['is_review_auth'] = 1; + return json_encode(['status' => 1,'msg' => 'success','data' => $aData]); + } + /** + * 审稿人邮件链接失效-重新申请邮件 + */ + public function applySendEmail(){ + //获取参数 + $aParam = empty($aParam) ? $this->request->post() : $aParam; + //获取审稿记录ID + $iArtRevId = empty($aParam['art_rev_id']) ? 0 : $aParam['art_rev_id']; + if(empty($iArtRevId)){ + return json_encode(['status' => 2,'msg' => 'Please select a record']); + } + //获取账号 + $sAccount = empty($aParam['account']) ? '' : $aParam['account']; + if(empty($sAccount)){ + return json_encode(['status' => 2,'msg' => 'Please enter your account']); + } + //查询用户是否存在 + $aWhere = ['account' => $sAccount,'state' => 0]; + $aUser = Db::name('user')->field('user_id')->where($aWhere)->find(); + if(empty($aUser)){ + return json_encode(['status' => 3,'msg' => 'Account does not exist']); + } + $iUserId = $aUser['user_id']; + + //查询审稿记录 + $aWhere = ['art_rev_id' => $iArtRevId]; + $aArticleReviewer = Db::name('article_reviewer')->field('art_rev_id,reviewer_id,article_id,state')->where($aWhere)->find(); + if(empty($aArticleReviewer)){ + return json_encode(['status' => 4,'msg' => 'Review record does not exist']); + } + if($aArticleReviewer['state'] != 4){ + return json_encode(['status' => 5,'msg' => 'The review link has not expired and no application is required']); + } + + //获取文章信息 + $aWhere = ['article_id' => $aArticleReviewer['article_id']]; + $aArticle = Db::name('article')->field('article_id,abstrart,title article_title,type,accept_sn,journal_id,state')->where($aWhere)->find(); + if(empty($aArticle)){ + return json_encode(['status' => 6,'msg' => 'The article does not exist']); + } + if($aArticle['state'] != 2){ + return json_encode(['status' => 7,'msg' => 'The article is not in the review status']); + } + + //查询期刊信息 + $aWhere = ['journal_id' => $aArticle['article_id'],'state' => 0]; + $aJournal = Db::name('journal')->field('title as journal_name,website')->find(); + //查询期刊信息 + if(empty($aArticle['journal_id'])){ + return json_encode(array('status' => 8,'msg' => 'The article is not associated with a journal' )); + } + $aWhere = ['state' => 0,'journal_id' => $aArticle['journal_id']]; + $aJournal = Db::name('journal')->where($aWhere)->find(); + if(empty($aJournal)){ + return json_encode(array('status' => 9,'msg' => 'No journal information found' )); + } + + //查询编辑邮箱 + $iUserId = empty($aJournal['editor_id']) ? 0 : $aJournal['editor_id']; + if(empty($iUserId)){ + return json_encode(array('status' => 10,'msg' => 'The journal to which the article belongs has not designated a responsible editor' )); + } + + //查询审稿人跟编辑的信息 + $aUserId = [$aArticleReviewer['reviewer_id'],$iUserId]; + $aWhere = ['user_id' => ['in',$aUserId],'state' => 0,'email' => ['<>','']]; + $aUser = Db::name('user')->field('user_id,email,realname,account')->where($aWhere)->select(); + if(empty($aUser)){ + return json_encode(['status' => 11,'msg' => "Reviewer and editor information not found"]); + } + $aUser = array_column($aUser, null,'user_id'); + + //更新审稿人重新申请状态为 + $aWhere = ['art_rev_id' => $iArtRevId,'state' => 4]; + $result = Db::name('article_reviewer')->where($aWhere)->limit(1)->update(['is_reapply' => 1,'reapply_time' => time(),'reviewer_act' => 1]); + if($result === false){ + return json_encode(array('status' => 11,'msg' => 'Application to reopen link failed' )); + } + //处理发邮件 + //邮件模版 + $aEmailConfig = [ + 'email_subject' => 'Request to Reopen Expired Review Link---{accept_sn}', + 'email_content' => ' + Dear Editor,

+ The reviewer would like to reopen the expired review link for the manuscript. Below are the details:
+ Reviewer Information:
+ Real Name:{realname}
+ Email:{email}

+ Sincerely,
Editorial Office
+ Subscribe to this journal
{journal_title}
+ Email: {journal_email}
+ Website: {website}' + ]; + //邮件内容 + $aSearch = [ + '{accept_sn}' => empty($aArticle['accept_sn']) ? '' : $aArticle['accept_sn'],//accept_sn + '{journal_title}' => empty($aJournal['title']) ? '' : $aJournal['title'],//期刊名 + '{journal_issn}' => empty($aJournal['issn']) ? '' : $aJournal['issn'], + '{journal_email}' => empty($aJournal['email']) ? '' : $aJournal['email'], + '{website}' => empty($aJournal['website']) ? '' : $aJournal['website'], + '{realname}' => empty($aUser[$aArticleReviewer['reviewer_id']]['realname']) ? '' : $aUser[$aArticleReviewer['reviewer_id']]['realname'], + '{email}' => empty($aUser[$aArticleReviewer['reviewer_id']]['email']) ? '' : $aUser[$aArticleReviewer['reviewer_id']]['email'], + ]; + + //发邮件 + //邮箱 + $email = empty($aUser[$iUserId]['email']) ? '' : $aUser[$iUserId]['email']; + if(empty($email)){ + return json_encode(['status' => 8,'msg' => 'Edit email as empty']); + } + $title = str_replace(array_keys($aSearch), array_values($aSearch),$aEmailConfig['email_subject']); + //邮件内容变量替换 + $content = str_replace(array_keys($aSearch), array_values($aSearch), $aEmailConfig['email_content']); + $pre = \think\Env::get('emailtemplete.pre'); + $net = \think\Env::get('emailtemplete.net'); + $net1 = str_replace("{{email}}",trim($email),$net); + $content=$pre.$content.$net1; + //发送邮件 + $memail = empty($aJournal['email']) ? '' : $aJournal['email']; + $mpassword = empty($aJournal['epassword']) ? '' : $aJournal['epassword']; + //期刊标题 + $from_name = empty($aJournal['title']) ? '' : $aJournal['title']; + //邮件队列组装参数 + $aResult = sendEmail($email,$title,$from_name,$content,$memail,$mpassword); + $iStatus = empty($aResult['status']) ? 1 : $aResult['status']; + $iIsSuccess = 2; + $sMsg = empty($aResult['data']) ? '失败' : $aResult['data']; + if($iStatus == 1){ + return json_encode(['status' => 1,'msg' => 'success']); + } + return json_encode(['status' => 8,'msg' => 'fail']); + } + /** + * 编辑审稿人邮件链接失效-重开 + */ + public function updateReviewerState(){ + + //获取参数 + $aParam = empty($aParam) ? $this->request->post() : $aParam; + + //获取审稿记录ID + $iArtRevId = empty($aParam['art_rev_id']) ? 0 : $aParam['art_rev_id']; + if(empty($iArtRevId)){ + return json_encode(['status' => 2,'msg' => 'Please select a record']); + } + //获取账号 + $sAccount = empty($aParam['account']) ? '' : $aParam['account']; + if(empty($sAccount)){ + return json_encode(['status' => 2,'msg' => 'Please enter your account']); + } + //查询用户是否存在 + $aWhere = ['account' => $sAccount,'state' => 0]; + $aUser = Db::name('user')->field('user_id,account,email')->where($aWhere)->find(); + if(empty($aUser)){ + return json_encode(['status' => 3,'msg' => 'Account does not exist']); + } + $iUserId = $aUser['user_id']; + + //查询审稿记录 + $aWhere = ['art_rev_id' => $iArtRevId]; + $aArticleReviewer = Db::name('article_reviewer')->field('art_rev_id,reviewer_id,article_id,state')->where($aWhere)->find(); + if(empty($aArticleReviewer)){ + return json_encode(['status' => 4,'msg' => 'Review record does not exist']); + } + if($aArticleReviewer['state'] != 4){ + return json_encode(['status' => 5,'msg' => 'The review link has not expired and no application is required']); + } + + //获取文章信息 + $aWhere = ['article_id' => $aArticleReviewer['article_id']]; + $aArticle = Db::name('article')->field('article_id,abstrart,title,type,accept_sn,journal_id,state')->where($aWhere)->find(); + if(empty($aArticle)){ + return json_encode(['status' => 6,'msg' => 'The article does not exist']); + } + if($aArticle['state'] != 2){ + return json_encode(['status' => 7,'msg' => 'The article is not in the review status']); + } + + //查询期刊信息 + $aWhere = ['journal_id' => $aArticle['article_id'],'state' => 0]; + $aJournal = Db::name('journal')->field('title as journal_name,website')->find(); + //查询期刊信息 + if(empty($aArticle['journal_id'])){ + return json_encode(array('status' => 8,'msg' => 'The article is not associated with a journal' )); + } + $aWhere = ['state' => 0,'journal_id' => $aArticle['journal_id']]; + $aJournal = Db::name('journal')->where($aWhere)->find(); + if(empty($aJournal)){ + return json_encode(array('status' => 9,'msg' => 'No journal information found' )); + } + + //判断编辑的操作权限 + $iEditorId = empty($aJournal['editor_id']) ? 0 : $aJournal['editor_id']; + if($iEditorId != $iUserId){ + return json_encode(array('status' => 10,'msg' => 'This article is not authorized for operation under the journal you are responsible for' )); + } + + //更新文章状态为邀请 + $aWhere = ['art_rev_id' => $iArtRevId,'state' => 4]; + $result = Db::name('article_reviewer')->where($aWhere)->limit(1)->update(['state' => 5,'ctime' => time(),'editor_act' => 1,'is_reapply' => 2,'update_time' => time(),'reviewer_act' => 0]); + if($result === false){ + return json_encode(array('status' => 11,'msg' => 'Status update failed' )); + } + + //查询审稿人的邮箱 + $aWhere = ['user_id' => $aArticleReviewer['reviewer_id'],'state' => 0,'email' => ['<>','']]; + $aUser = Db::name('user')->field('user_id,email,realname,account')->where($aWhere)->find(); + if(empty($aUser)){ + return json_encode(['status' => 12,'msg' => "Reviewer and editor information not found"]); + } + //处理发邮件 + //邮箱 + $email = empty($aUser['email']) ? '' : $aUser['email']; + if(empty($email)){ + return json_encode(['status' => 13,'msg' => 'Reviewer email as empty']); + } + //邮件模版 + $aEmailConfig = [ + 'email_subject' => 'Invitation to review a manuscript for {journal_title}-[{accept_sn}]', + 'email_content' => ' + Dear Dr. {realname},

+ The manuscript entitled "{article_title}" has been submitted to the journal {journal_title}.The Editor-in-Chief would be most grateful if you could offer an opinion regarding its suitability for publication in the journal {journal_title}.

+ Abstract of the Manuscript:
+ {abstrart}

+ Please let us know if there are any potential conflicts of interest and click the following link to review the manuscript.
+ Click here to accept the invitation to review
+ Your username: {account}
+ Your original password:123456qwe, if you have reset the password, please login with the new one or click the "forgot password".
+ Thank you for your continued support of our journal.

+ Sincerely,
Editorial Office
+ Subscribe to this journal
{journal_title}
+ Email: {journal_email}
+ Website: {website}' + ]; + $aSearch = [ + '{accept_sn}' => empty($aArticle['accept_sn']) ? '' : $aArticle['accept_sn'],//accept_sn + '{article_title}' => empty($aArticle['title']) ? '' : $aArticle['title'],//文章标题 + '{abstrart}' => empty($aArticle['abstrart']) ? '' : $aArticle['abstrart'],//文章摘要 + '{journal_title}' => empty($aJournal['title']) ? '' : $aJournal['title'],//期刊名 + '{journal_issn}' => empty($aJournal['issn']) ? '' : $aJournal['issn'], + '{journal_email}' => empty($aJournal['email']) ? '' : $aJournal['email'], + '{website}' => empty($aJournal['website']) ? '' : $aJournal['website'], + ]; + //用户名 + $realname = empty($aUser['account']) ? '' : $aUser['account']; + $realname = empty($aUser['realname']) ? $realname : $aUser['realname']; + $aSearch['{realname}'] = $realname; + //用户账号 + $aSearch['{account}'] = empty($aUser['account']) ? '' : $aUser['account']; + //审稿链接 + $oArticle = new \app\api\controller\Article; + $aSearch['{creatLoginUrlForreviewer}'] = $oArticle->creatLoginUrlForreviewer(['user_id' => $aArticleReviewer['reviewer_id']],$iArtRevId); + $title = str_replace(array_keys($aSearch), array_values($aSearch),$aEmailConfig['email_subject']); + //邮件内容变量替换 + $content = str_replace(array_keys($aSearch), array_values($aSearch), $aEmailConfig['email_content']); + $pre = \think\Env::get('emailtemplete.pre'); + $net = \think\Env::get('emailtemplete.net'); + $net1 = str_replace("{{email}}",trim($email),$net); + $content=$pre.$content.$net1; + //发送邮件 + $memail = empty($aJournal['email']) ? '' : $aJournal['email']; + $mpassword = empty($aJournal['epassword']) ? '' : $aJournal['epassword']; + //期刊标题 + $from_name = empty($aJournal['title']) ? '' : $aJournal['title']; + //邮件队列组装参数 + $aResult = sendEmail($email,$title,$from_name,$content,$memail,$mpassword); + $iStatus = empty($aResult['status']) ? 1 : $aResult['status']; + $iIsSuccess = 2; + $sMsg = empty($aResult['data']) ? '失败' : $aResult['data']; + if($iStatus == 1){ + return json_encode(['status' => 1,'msg' => 'success']); + } + return json_encode(['status' => 14,'msg' => 'fail']); + } } diff --git a/application/common.php b/application/common.php index ba27249..f4fb304 100644 --- a/application/common.php +++ b/application/common.php @@ -861,7 +861,7 @@ function my_doiToFrag2($data) $suffix = empty($parts[1]) ? 0 : intval($parts[1]); if($prefix > $suffix){ $prefixLen = strlen($prefix); - $suffixLen = strlen($sufix); + $suffixLen = strlen($suffix); $missingLen = $prefixLen - $suffixLen; if ($missingLen > 0) { $fillPart = substr($prefix, 0, $missingLen); diff --git a/application/common/ArticleParserService.php b/application/common/ArticleParserService.php index 9b93a37..5b0dd52 100644 --- a/application/common/ArticleParserService.php +++ b/application/common/ArticleParserService.php @@ -14,7 +14,7 @@ class ArticleParserService { private $phpWord; private $sections; - + private $iNum = 0; public function __construct($filePath = '') { if (!file_exists($filePath)) { @@ -225,6 +225,10 @@ class ArticleParserService $aParam['corresponding'] = $oDealFile->getCorrespondingAuthors($aParam); //keywords 和 摘要 $aContent = $oDealFile->extractFromWord(); + if(!mb_check_encoding($sTitle, 'UTF-8')){ + $sTitle = mb_convert_encoding($sTitle, 'UTF-8', 'GBK'); + } + $aParam['title'] = $oDealFile->fullDecode($aParam['title']); $aParam += empty($aContent['data']) ? [] : $aContent['data']; return json_encode(['status' => 1,'msg' => 'success','data' => $aParam]); } @@ -240,190 +244,25 @@ class ArticleParserService foreach ($section->getElements() as $element) { $text = $this->getTextFromElement($element); $length = mb_strlen(trim($text)); - if ($length > $maxLength && $length > 10) { // 标题通常较长 + if ($length > $maxLength && $length > 3) { // 标题通常较长 $title = trim($text); $maxLength = $length; break 2; // 取第一个最长段落作为标题 } } } - if(!empty($title) && !mb_check_encoding($title, 'UTF-8')){ - $title = mb_convert_encoding($title, 'UTF-8', 'GBK'); - } return $title; } - // 提取作者 - // private function getAuthors($aParam = []) { - // $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title']; - // $sAuthorContent = $this->getNextParagraphAfterText($title); - // if (empty($sAuthorContent)) { - // return ['author' => [], 'report' => []]; - // } - - // //编码修复 - // $possibleEncodings = [ - // 'Windows-1252', 'UTF-8', 'GBK', 'GB2312', - // 'Latin-1', 'ISO-8859-1', 'CP1252' - // ]; - // $encodedContent = @mb_convert_encoding($sAuthorContent, 'UTF-8', implode(',', $possibleEncodings)); - // $sAuthorContent = $encodedContent ?: $sAuthorContent; - - // //清理不可见字符 - // $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent); - - // //修复特殊符号乱码 - // $symbolMap = [ - // '†' => '†', 'â ' => '†', 'â' => '†', '?†' => '†', - // ':' => ':', ',' => ',', '—' => '-', - // '啊' => '' // 针对性移除异常字符“啊”(若为固定乱码) - // ]; - // $sAuthorContent = strtr($sAuthorContent, $symbolMap); - - // //格式标准化 - // $sAuthorContent = str_replace([',', ';', ';', '、'], ',', $sAuthorContent); // 统一分隔符 - // $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // and转逗号 - // $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // 合并多余空格 - // $sAuthorContent = trim($sAuthorContent); - - // // 处理作者 - // $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确 - // $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格 - // $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#" - // $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式(防止被拆分) - // //标记上标内的逗号+空格(多编号) - // $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1$2', $content); - // // 原有步骤2:正则匹配(扩展上标符号支持,保持原有逻辑) - // $pattern = '/ - // ([A-Za-z\s\.\-]+?) # 姓名(支持缩写、空格) - // \s* # 姓名与上标间空格 - // ( # 上标组(扩展符号支持) - // \d+ # 起始数字 - // (?:[†#*,]|\d+)* # 允许:†#*符号、逗号、+数字(兼容1,†、1,*等) - // ) - // \s*,? # 作者间逗号(可选) - // (?=\s|$) # 确保后面是空格或结尾 - // /ux'; - - // preg_match_all($pattern, $tempStr, $matches); - // $authorList = []; - // if(!empty($matches[1])){ - // foreach ($matches[1] as $i => $name) { - // $name = trim($name); - // $superscript = trim($matches[2][$i]); - // $superscript = str_replace('', ',', $superscript); // 恢复多编号逗号 - // $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号 - // // 修复符号与数字间的空格(如原始"1 *"被误处理为"1*"的情况,保持原样) - // $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript); - // if (!empty($name)) { - // $authorList[] = [ - // 'name' => $name, - // 'superscript' => $superscript - // ]; - // } - // } - // }else { - // // 按“两个或多个连续空格”拆分(姓名之间的分隔) - // $authorList = array_filter( - // array_map('trim', - // preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent) - // ) - // ); - // } - - - // // //处理作者 - // // $authorList = []; - // // // 新正则:匹配“姓名+上标”整体,允许上标含逗号(如1,†) - // // // 逻辑:姓名以字母/中文开头,上标以数字开头、以符号/数字结尾 - // // // if (preg_match_all('/([A-Za-z\x{4e00}-\x{9fa5}][A-Za-z\s·\-\'\x{4e00}-\x{9fa5}]*)\s*([\d,†#*]+)/u', $sAuthorContent, $matches)) { - // // if(preg_match_all('/([A-Za-z\x{4e00}-\x{9fa5}][A-Za-z\s·\-\'\x{4e00}-\x{9fa5}]*)\s*(\d[\d,†#\s*]*)/u', $sAuthorContent, $matches)){ - // // for ($i = 0; $i < count($matches[1]); $i++) { - // // $authorList[] = trim($matches[1][$i] . $matches[2][$i]); - // // } - // // } else { - // // // 按“两个或多个连续空格”拆分(姓名之间的分隔) - // // $authorList = array_filter( - // // array_map('trim', - // // preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent) - // // ) - // // ); - // // } - // $aAuthorData = []; - // $aReport = []; - // $namePattern = '/ - // (?:[A-Za-z\s·\-\']+| # 英文姓名(支持空格、连字符) - // [\x{4e00}-\x{9fa5}]+| # 中文姓名 - // [\x{1800}-\x{18AF}]+| # 蒙古文姓名 - // [A-Z]\.) # 单字母缩写(如 J.) - // /ux'; - // var_dump($authorList);exit; - // foreach ($authorList as $authorStr) { - // if (empty($authorStr)) continue; - // var_dump($authorList);exit; - // //分离姓名与上标(支持上标含逗号,如1,†) - // $superscript = ''; - // // 新正则:匹配以数字开头、含逗号/符号的完整上标(如1,†、2*#) - // $authorStr = trim(trim($authorStr,','),' '); - // // if (preg_match('/([\d,†#*]+)$/u', $authorStr, $supMatch)) { - // // if(preg_match('/\s*([\d,†#* ]+)$/u', $authorStr, $supMatch)){ - // // if (preg_match('/.*?\s*([\d,†#* ]+)$/u', $authorStr, $supMatch)) { - // // if (preg_match('/.*?\s*([\d,\x{2020}#* ]+?)\s*$/u', $authorStr, $supMatch)) { - // // if (preg_match('/^(.+?)\D*?(\d[\d,#*†,\s]*)$/u', $authorStr, $supMatch)) { - // // $superscript = $supMatch[1]; - // // // 移除上标,保留纯姓名(避免残留符号) - // // $nameStr = trim(preg_replace('/' . preg_quote($superscript, '/') . '$/', '', $authorStr)); - // // } else { - // // $nameStr = $authorStr; - // // } - // $pattern = '/^(.+?)\s*(\d[\d,#*†\s]*?)\s*$/u'; - // if (preg_match($pattern, $authorStr, $supMatch)) { - // $nameStr = empty($supMatch[1]) ? '' : trim($supMatch[1]); // 姓名部分:"Liguo Zhang" - // $superscript = empty($supMatch[2]) ? $nameStr : $nameStr.trim($supMatch[2]); // 上标部分:"1 - // // echo "姓名: $nameStr, 上标: $superscript\n"; - // } else { - // $nameStr = $authorStr; - // } - // //验证姓名合法性(过滤无效内容) - // if (!preg_match($namePattern, $nameStr)) { - // continue; - // } - // //解析上标信息(正确识别1,†中的机构编号和符号) - // $companyId = ''; - // $isSuper = 0; - // $isReport = 0; - // if (!empty($superscript)) { - // // 提取机构编号(忽略上标中的逗号,如1,† → 提取1) - // if (preg_match('/(\d+)/', $superscript, $numMatch)) { - // $companyId = $numMatch[1]; - // } - // // 识别特殊符号(#为超级作者,*†为通讯作者) - // $isSuper = strpos($superscript, '#') !== false ? 1 : 0; - // $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0; - // } - // if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) { - // $nameStr = trim($match[1]); - // } - // $aAuthorData[] = [ - // 'name' => $nameStr, - // 'company_id' => $companyId, - // 'is_super' => $isSuper, - // 'is_report' => $isReport - // ]; - // if ($isReport) { - // $aReport[] = $nameStr; - // } - // } - // var_dump($aAuthorData);exit; - // return ['author' => $aAuthorData,'report' => array_unique($aReport)]; - // } // 提取作者 private function parseAuthorsWithoutRegex($str = '') { if (empty($str)) { return []; } - // 清理乱码和特殊字符(扩展全角数字处理) - $str = mb_convert_encoding($str, 'UTF-8', 'auto'); + if(!mb_check_encoding($str, 'UTF-8')){ + $str = mb_convert_encoding($str, 'UTF-8', 'GBK'); + } + $str = $this->fullDecode($str); $str = str_replace(["\xC2\xA0", 'ï¼', '�', ',', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'], [' ', ' ', ' ', ' ', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'], $str); $str = trim(str_replace([' and ', ' AND ', ' And '], ', ', $str)); @@ -584,15 +423,10 @@ class ArticleParserService if (empty($sAuthorContent)) { return ['author' => [], 'report' => []]; } - - //编码修复 - $possibleEncodings = [ - 'Windows-1252', 'UTF-8', 'GBK', 'GB2312', - 'Latin-1', 'ISO-8859-1', 'CP1252' - ]; - $encodedContent = @mb_convert_encoding($sAuthorContent, 'UTF-8', implode(',', $possibleEncodings)); - $sAuthorContent = $encodedContent ?: $sAuthorContent; - + if(!mb_check_encoding($sAuthorContent, 'UTF-8')){ + $sAuthorContent = mb_convert_encoding($sAuthorContent, 'UTF-8', 'GBK'); + } + $sAuthorContent = $this->fullDecode($sAuthorContent); //清理不可见字符 $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent); @@ -614,14 +448,10 @@ class ArticleParserService return ['author' => [],'report' => []]; } $aReport = $aAuthorData = []; - foreach ($aAuthor as $key => $value) { if(empty($value['name']) && empty($value['superscript'])){ continue; } - if(!mb_check_encoding($value['name'], 'UTF-8')){ - $value['name'] = mb_convert_encoding($value['name'], 'UTF-8', 'GBK'); - } if(!empty($value['name']) && !empty($value['is_report']) && $value['is_report'] == 1){ $aReport[] = $value['name']; } @@ -629,175 +459,6 @@ class ArticleParserService } return ['author' => $aAuthorData,'report' => array_unique($aReport)]; } -// private function getAuthors($aParam = []) { -// $title = empty($aParam['title']) ? $this->getTitle() : $aParam['title']; -// $sAuthorContent = $this->getNextParagraphAfterText($title); -// if (empty($sAuthorContent)) { -// return ['author' => [], 'report' => []]; -// } - -// //编码修复 -// $possibleEncodings = [ -// 'Windows-1252', 'UTF-8', 'GBK', 'GB2312', -// 'Latin-1', 'ISO-8859-1', 'CP1252' -// ]; -// $encodedContent = @mb_convert_encoding($sAuthorContent, 'UTF-8', implode(',', $possibleEncodings)); -// $sAuthorContent = $encodedContent ?: $sAuthorContent; - -// //清理不可见字符 -// $sAuthorContent = preg_replace('/[\x00-\x1F\x7F\x{200B}-\x{200F}]/u', '', $sAuthorContent); - -// //修复特殊符号乱码 -// $symbolMap = [ -// '†' => '†', 'â ' => '†', 'â' => '†', '?†' => '†', -// ':' => ':', ',' => ',', '—' => '-', -// '啊' => '' // 针对性移除异常字符“啊”(若为固定乱码) -// ]; -// $sAuthorContent = strtr($sAuthorContent, $symbolMap); - -// //格式标准化 -// $sAuthorContent = str_replace([',', ';', ';', '、'], ',', $sAuthorContent); // 统一分隔符 -// $sAuthorContent = preg_replace('/\s+and\s+/i', ', ', $sAuthorContent); // and转逗号 -// $sAuthorContent = preg_replace('/\s+/', ' ', $sAuthorContent); // 合并多余空格 -// $sAuthorContent = trim($sAuthorContent); -// var_dump($this->parseAuthorsWithoutRegex($sAuthorContent));exit; -// // 关键预处理:兼容"and"分隔符、清理乱码、统一空格 -// $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); -// $content = str_replace(["\xC2\xA0", 'ï¼', '�', ','], ' ', $content); // 清理乱码和全角符号 -// $content = preg_replace('/\band\b/i', ',', $content); // 将 "and" 转为逗号(统一分隔符) -// $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并数字与符号间的空格(如"1 *"→"1*") -// $content = trim(preg_replace('/\s+/', ' ', $content)); // 合并连续空格 - -// // 标记上标内的逗号(多编号处理) -// $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1$2', $content); - -// // 核心正则(保持原有结构,扩展符号支持) -// $pattern = '/ -// ([A-Za-z\s\.\-]+?) # 姓名(支持缩写、空格、连字符) -// \s* # 姓名与上标间的空格(允许0或多个) -// ( # 上标组(扩展兼容所有符号) -// \d+ # 起始数字(至少1个数字) -// (?:[†#*,]|\d+)* # 允许:符号(†#*)、逗号、+数字(多编号) -// ) -// \s*,? # 作者间的逗号(可选,允许逗号前有空格) -// (?=\s|$) # 确保后面是空格或字符串结尾(避免跨作者匹配) -// /ux'; - -// preg_match_all($pattern, $tempStr, $matches); - -// // 解析结果并格式化 -// $authorList = []; -// if (!empty($matches[1])) { -// foreach ($matches[1] as $i => $name) { -// $name = trim($name); -// $superscript = trim($matches[2][$i]); -// $superscript = str_replace('', ',', $superscript); // 恢复多编号逗号 -// $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾多余逗号 -// if (!empty($name)) { -// $authorList[] = [ -// 'name' => $name, -// 'superscript' => $superscript -// ]; -// } -// } -// } - -// // 输出结果 -// echo "
";
-// print_r($authorList);
-// echo "
"; -// exit; - -// // 处理作者 -// $content = mb_convert_encoding($sAuthorContent, 'UTF-8', 'auto'); // 确保编码正确 -// $content = str_replace("\xC2\xA0", ' ', $content); // 替换非-breaking空格为普通空格 -// $content = preg_replace('/(\d+)\s*([*#†])/', '$1$2', $content); // 合并"1 *"为"1*"、"1 #"为"1#" -// $content = preg_replace('/,†/', ',†', $content); // 保留"1,†"格式(防止被拆分) - -// //标记上标内的逗号+空格(多编号) -// $tempStr = preg_replace('/(\d+)\s*,\s*(\d+)/', '$1$2', $content); -// // 原有步骤2:正则匹配(扩展上标符号支持,保持原有逻辑) -// $pattern = '/ -// ([A-Za-z\s\.\-]+?) # 姓名(支持缩写、空格) -// \s* # 姓名与上标间空格 -// ( # 上标组(扩展符号支持) -// \d+ # 起始数字 -// (?:[†#*,]|\d+)* # 允许:†#*符号、逗号、+数字(兼容1,†、1,*等) -// ) -// \s*,? # 作者间逗号(可选) -// (?=\s|$) # 确保后面是空格或结尾 -// /ux'; - -// preg_match_all($pattern, $tempStr, $matches); -// var_dump($matches);exit; -// $authorList = []; -// if(!empty($matches[1])){ -// foreach ($matches[1] as $i => $name) { -// $name = trim($name); -// $superscript = trim($matches[2][$i]); -// $superscript = str_replace('', ',', $superscript); // 恢复多编号逗号 -// $superscript = preg_replace('/,$/', '', $superscript); // 清理末尾逗号 -// // 修复符号与数字间的空格(如原始"1 *"被误处理为"1*"的情况,保持原样) -// $superscript = preg_replace('/(\d)([*#†])/', '$1$2', $superscript); -// if (!empty($name)) { -// $authorList[] = [ -// 'name' => $name, -// 'superscript' => $superscript -// ]; -// } -// } -// }else { -// // 按“两个或多个连续空格”拆分(姓名之间的分隔) -// $authorList = array_filter( -// array_map('trim', -// preg_split('/(,\p{Z}*|\p{Z}{2,})/u', $sAuthorContent) -// ) -// ); -// } - - -// // //处理作者 -// $aAuthorData = []; -// $aReport = []; -// $namePattern = '/ -// (?:[A-Za-z\s·\-\']+| # 英文姓名(支持空格、连字符) -// [\x{4e00}-\x{9fa5}]+| # 中文姓名 -// [\x{1800}-\x{18AF}]+| # 蒙古文姓名 -// [A-Z]\.) # 单字母缩写(如 J.) -// /ux'; - -// foreach ($authorList as $authorStr){ -// if (empty($authorStr)) continue; - -// //获取下标 -// $superscript = empty($authorStr['superscript']) ? $authorStr : $authorStr['superscript']; -// $nameStr = empty($authorStr['name']) ? $authorStr : $authorStr['name']; - -// $companyId = []; -// $isSuper = 0; -// $isReport = 0; -// if (!empty($superscript)) { -// // 提取机构编号(忽略上标中的逗号,如1,† → 提取1) -// preg_match_all('/\d+/', $superscript, $numMatch); -// // 识别特殊符号(#为超级作者,*†为通讯作者) -// $isSuper = strpos($superscript, '#') !== false ? 1 : 0; -// $isReport = (strpos($superscript, '*') !== false || strpos($superscript, '†') !== false) ? 1 : 0; -// } -// if (preg_match("/^([A-Za-z\s'\.-]+)/u", $nameStr, $match)) { -// $nameStr = trim($match[1]); -// } -// $aAuthorData[] = [ -// 'name' => $nameStr, -// 'company_id' => empty($numMatch[0]) ? [] : $numMatch[0], -// 'is_super' => $isSuper, -// 'is_report' => $isReport -// ]; -// if ($isReport) { -// $aReport[] = $nameStr; -// } -// } -// return ['author' => $aAuthorData,'report' => array_unique($aReport)]; -// } // 获取机构 private function getCompany($aParam = []){ @@ -815,16 +476,39 @@ class ArticleParserService $currentNumber = null; // 当前序号 foreach ($allLines as $line) { $line = trim($line); - if (empty($line)) continue; - - // 判断是否是新条目的开头:行首为数字(后续可接任意字符或直接接内容) + if (empty($line)) { + continue; + } + if(!mb_check_encoding($line, 'UTF-8')){ + $line = mb_convert_encoding($line, 'UTF-8', 'GBK'); + } + $line = $this->fullDecode($line); $number = ''; $i = 0; $lineLen = strlen($line); // 提取行首的连续数字(作为序号) - while ($i < $lineLen && ctype_digit($line[$i])) { - $number .= $line[$i]; - $i++; + $hasFirstChar = false; + while ($i < $lineLen) { + $currentChar = $line[$i]; + // 首字符处理:允许 26个字母(大小写)或数字 + if (!$hasFirstChar) { + if (ctype_digit($currentChar) || ctype_alpha($currentChar)) { + $number .= $currentChar; + $hasFirstChar = true; + $i++; + } else { + // 首字符不符合(非字母/数字),终止循环 + break; + } + } else { + // 后续字符必须是数字(保持原逻辑) + if (ctype_digit($currentChar)) { + $number .= $currentChar; + $i++; + } else { + break; + } + } } // 若行首有数字,则视为新条目 @@ -840,34 +524,36 @@ class ArticleParserService continue; } - // 非新条目,合并到当前序号的内容中 - if ($currentNumber !== null) { - $grouped[$currentNumber] .= ' ' . $line; - } + // // 非新条目,合并到当前序号的内容中 + // if ($currentNumber !== null) { + // $grouped[$currentNumber] .= ' ' . $line; + // } } - //清理结果 - $possibleEncodings = [ - 'Windows-1252', 'UTF-8', 'GBK', 'GB2312', - 'Latin-1', 'ISO-8859-1', 'CP1252' - ]; $aCompany = []; foreach ($grouped as $number => $institution) { - $encodedContent = @mb_convert_encoding($institution, 'UTF-8', implode(',', $possibleEncodings)); - $sCompany = $encodedContent ?: $sCompany; + $institution = $this->fullDecode($institution); + // 原有基础清理逻辑不变 $institution = preg_replace('/\s+/', ' ', $institution); // 合并多余空格 - $institution = rtrim($institution, '.'); - $institution = preg_replace('/^\d+\s+/', '', $institution); + $institution = rtrim($institution, '.'); // 去除末尾句号 + $institution = preg_replace('/^\d+\s+/', '', $institution); // 去除开头数字 $institution = trim($institution); // 清理首尾空格 - preg_match('/(.*?, [A-Za-z]+ \d+, [A-Za-z]+)/', $institution, $institutionmatches);; - $institution = trim($institutionmatches[1] ?? $institution); - if (preg_match('/^(.*?)(?=\s*\*Email)/', $institution, $matches)) { - $institution = trim($matches[1]); // trim() 去除内容前后多余空格 + + // 增强地址提取:匹配"机构名, 城市 邮编, 国家"格式(兼容更多变体) + // 允许地址中包含多个逗号(如子机构、街道信息),最终以"城市 邮编, 国家"结尾 + // preg_match('/(.*?, [A-Za-z\s]+ \d+, [A-Za-z\s]+)/', $institution, $institutionmatches); + // $institution = trim($institutionmatches[1] ?? $institution); + // 强化冗余信息过滤:去除"*"及之后的内容(包括通讯作者、邮箱等) + // 新增对"#"、"†"等标记的过滤,兼容更多期刊格式 + if (preg_match('/^(.*?)(?=\s*[\*#†]|(?i)\s*Email)/', $institution, $matches)) { + $institution = trim($matches[1]); } - if(!empty($institution) && !mb_check_encoding($institution, 'UTF-8')){ + + // 编码校验不变 + if (!empty($institution) && !mb_check_encoding($institution, 'UTF-8')) { $institution = mb_convert_encoding($institution, 'UTF-8', 'GBK'); } - $aCompany[$number] = $institution; + $aCompany[$number] = empty($institution) ? '' : trim(trim($institution),'.'); } return $aCompany; } @@ -891,13 +577,11 @@ class ArticleParserService // 获取机构后的完整内容 $corrText = $this->getContentAfterText($sCompany); - //编码修复 - $possibleEncodings = [ - 'Windows-1252', 'UTF-8', 'GBK', 'GB2312', - 'Latin-1', 'ISO-8859-1', 'CP1252' - ]; - $encodedContent = @mb_convert_encoding($corrText, 'UTF-8', implode(',', $possibleEncodings)); - $corrText = $encodedContent ?: $corrText; + if(!mb_check_encoding($corrText, 'UTF-8')){ + $corrText = mb_convert_encoding($corrText, 'UTF-8', 'GBK'); + } + $corrText = $this->fullDecode($corrText); + // // 调试 // file_put_contents(ROOT_PATH . 'runtime/corr_text_raw.log', $corrText); @@ -922,23 +606,35 @@ class ArticleParserService $aCorresponding[] = [ 'name' => $sName, 'email' => isset($email[2]) ? trim($email[2]) : '', - 'postal_address' => isset($address[2]) ? trim($address[2]) : '', + 'postal_address' => isset($address[2]) ? trim(trim($address[2]),'.') : '', 'tel' => isset($tel[2]) ? trim($tel[2]) : '' ]; } if(empty($aCorresponding)){ - $pattern = '/Corresponding Authors: (.*?)(?=$|;)/s'; + // $pattern = '/Corresponding Authors|Correspondence to|Correspondence: (.*?)(?=$|;)/s'; + $pattern = '/(Corresponding Authors|Correspondence to|Correspondence)\s*:\s*([\s\S]+?)(?=\n\s*\n|$|;)/is'; + $corrText = trim($corrText,'*'); preg_match($pattern, $corrText, $match); - if (!empty($match[1])) { - $corrContent = $match[1]; + if (!empty($match[2])) { + $corrContent = $match[2]; // 提取每个作者的名称和邮箱(优化正则,支持更多字符) $authorPattern = '/([A-Za-z\s]+?),\s*E-mail:\s*([\w@\.\-]+)/'; preg_match_all($authorPattern, $corrContent, $authors); if(!empty($authors[1])){ for ($i = 0; $i < count($authors[1]); $i++) { $aCorresponding[] = [ - 'name' => empty($authors[1][$i]) ? '' : trim($authors[1][$i]), - 'email' => empty($authors[2][$i]) ? '' : trim($authors[2][$i]) + 'name' => empty($authors[1][$i]) ? '' : trim(trim($authors[1][$i]),'.'), + 'email' => empty($authors[2][$i]) ? '' : trim(trim($authors[2][$i]),'.') + ]; + } + } + if(empty($authors[1])){ + $authorPattern = '/([A-Za-z0-9\s]+?),\s*([\w@\.\-]+)(?=\.?)/'; + preg_match_all($authorPattern, $corrContent, $authors); + for ($i = 0; $i < count($authors[1]); $i++) { + $aCorresponding[] = [ + 'name' => empty($authors[1][$i]) ? '' : trim(trim($authors[1][$i]),'.'), + 'email' => empty($authors[2][$i]) ? '' : trim(trim($authors[2][$i]),'.') ]; } } @@ -1040,106 +736,379 @@ class ArticleParserService } // 统一提取元素文本 - private function getTextFromElement($element,$lineNumber = 0){ + private function getTextFromElement(\PhpOffice\PhpWord\Element\AbstractElement $element, int $lineNumber = 0){ $text = ''; - // 处理PreserveText元素 + + // 1. 常量化特殊引号映射(避免每次调用重建数组,提升循环调用性能) + static $specialQuotesMap = [ + '’' => "'", // 右单引号(U+2019)→ 普通单引号(U+0027) + '‘' => "'", // 左单引号(U+2018)→ 普通单引号(U+0027) + '“' => '"', // 左双引号(U+201C)→ 普通双引号(U+0022) + '”' => '"', // 右双引号(U+201D)→ 普通双引号(U+0022) + '„' => '"', // 下双引号(U+201E)→ 普通双引号(兼容欧洲排版) + '‟' => '"', // 右双引号(U+201F)→ 普通双引号(兼容少见排版) + ]; + + // 支持H1-H9标题格式(优化:移除无用变量 $titleDepth,避免冗余) + if ($element instanceof \PhpOffice\PhpWord\Element\Title) { + $titleContent = $element->getText(); + $titleText = ''; + + if ($titleContent instanceof \PhpOffice\PhpWord\Element\TextRun) { + $titleText = $this->getTextFromElement($titleContent); + } else { + $titleText = strtr((string)$titleContent, $specialQuotesMap); + } + + $text .= $titleText . ' '; + return $this->cleanText($text); + } + + // 项目编号(优化:严格空值判断,避免 0 被 empty 误判) + if ($element instanceof \PhpOffice\PhpWord\Element\ListItemRun) { + $this->iNum = isset($this->iNum) && is_numeric($this->iNum) ? $this->iNum : 0; + $this->iNum++; + $text .= $this->iNum . ' '; + } + + // 处理PreserveText(含HYPERLINK邮箱提取,优化:反射前先判断属性存在) if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) { - // 通过反射获取私有属性 text - $reflection = new \ReflectionClass($element); - $property = $reflection->getProperty('text'); - $property->setAccessible(true); - $textParts = $property->getValue($element); + try { + $reflection = new \ReflectionClass($element); + // 先判断属性是否存在,避免反射不存在的属性报错(兼容极端版本) + if (!$reflection->hasProperty('text')) { + return $this->cleanText($text); + } + $property = $reflection->getProperty('text'); + $property->setAccessible(true); + $textParts = $property->getValue($element) ?? []; + } catch (\ReflectionException $e) { + return $this->cleanText($text); + } + foreach ($textParts as $part) { + $part = (string)$part; if (strpos($part, 'HYPERLINK') !== false) { - // 解码 HTML 实体(" -> ") - $decoded = html_entity_decode($part); - // 提取 mailto: 后的邮箱 - if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/i', $decoded, $match)) { + $decoded = html_entity_decode($part, ENT_QUOTES | ENT_HTML5); + // 邮箱正则不变(已优化,兼容国际域名) + if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10})/i', $decoded, $match)) { $text .= $match[1] . ' '; } } else { - // 普通文本直接拼接 + $part = strtr($part, $specialQuotesMap); $text .= $part; } } - return $text; + return $this->cleanText($text); } - // 处理表格和单元格(E-mail可能在表格中) + + // 处理表格(优化:避免行尾多余空格,通过 cleanText 自动合并) if ($element instanceof \PhpOffice\PhpWord\Element\Table) { foreach ($element->getRows() as $row) { foreach ($row->getCells() as $cell) { - $text .= $this->getTextFromElement($cell); + $text .= $this->getTextFromElement($cell) . ' '; } + // 移除行尾额外空格(cleanText 会合并连续空格,无需手动添加) } - return $text; + return $this->cleanText($text); } + + // 处理单元格(逻辑不变,保持递归提取) if ($element instanceof \PhpOffice\PhpWord\Element\Cell) { foreach ($element->getElements() as $child) { $text .= $this->getTextFromElement($child); } - return $text; + return $this->cleanText($text); } - //处理嵌套元素(递归提取所有子元素) - if (method_exists($element, 'getElements')) { + // 处理嵌套元素(逻辑不变,增强类型校验可读性) + if (method_exists($element, 'getElements') && is_callable([$element, 'getElements'])) { foreach ($element->getElements() as $child) { - $text .= $this->getTextFromElement($child); + if ($child instanceof \PhpOffice\PhpWord\Element\AbstractElement) { + $text .= $this->getTextFromElement($child); + } } } - //处理文本元素(包括带格式的文本) + // 处理纯文本元素(逻辑不变,保持特殊引号替换) if ($element instanceof \PhpOffice\PhpWord\Element\Text) { - $text .= $element->getText(); + $textPart = (string)$element->getText(); // 显式强制转换,避免类型隐患 + $textPart = strtr($textPart, $specialQuotesMap); + $text .= $textPart; } - //处理超链接(优先提取链接目标,可能是邮箱) + // 处理超链接(逻辑不变,保持邮箱优先提取) if ($element instanceof \PhpOffice\PhpWord\Element\Link) { - $target = $element->getTarget(); + $target = (string)$element->getTarget(); if (strpos($target, 'mailto:') === 0) { - $text .= str_replace('mailto:', '', $target) . ' '; // 剥离mailto:前缀 + $text .= rtrim(str_replace('mailto:', '', $target)) . ' '; } - $text .= $element->getText() . ' '; + $linkText = strtr((string)$element->getText(), $specialQuotesMap); + $text .= $linkText . ' '; } - //处理字段和注释(可能包含隐藏邮箱) + // 处理字段和注释(优化:显式强制转换,避免非字符串拼接) if ($element instanceof \PhpOffice\PhpWord\Element\Field) { - $text .= $element->getContent() . ' '; + $text .= (string)$element->getContent() . ' '; } if ($element instanceof \PhpOffice\PhpWord\Element\Note) { - $text .= $element->getContent() . ' '; + $text .= (string)$element->getContent() . ' '; } - //清理所有不可见字符(关键:移除格式干扰) - $text = preg_replace('/[\x00-\x1F\x7F-\x9F]/', ' ', $text); // 移除控制字符 - $text = str_replace(["\t", "\r", "\n"], ' ', $text); // 统一空白字符 - $text = preg_replace('/\s+/', ' ', $text); // 合并多个空格 - if(!empty($text) && !mb_check_encoding($text, 'UTF-8')){ - $text = mb_convert_encoding($text, 'UTF-8', 'GBK'); - } - return $text; + + return $this->cleanText($text); } + /** + * 统一文本清理方法(稳健、高效、不破坏普通单引号) + * @param string $text 待清理文本 + * @return string 清理后的纯文本 + */ + private function cleanText(string $text){ + + //编码正确 + if (!mb_check_encoding($text, 'UTF-8')) { + $text = mb_convert_encoding( + $text, + 'UTF-8', + 'GBK,GB2312,GB18030,Big5,ISO-8859-1,CP1252,UTF-16,UTF-32' // 补充常见西文编码,兼容更多场景 + ); + } + //移除不可见控制字符 + $text = preg_replace('/[\x00-\x1F\x7F-\x9F]/u', ' ', $text); + + //统一空白字符 + $text = str_replace([ + "\t", "\r", "\n", + chr(0xC2) . chr(0xA0), // 不间断空格( ) + ' ', // 全角空格(U+3000) + chr(0xE2) . chr(0x80) . chr(0xAF), // 窄无中断空格(U+202F) + ], ' ', $text); + + //合并连续空格 + $text = preg_replace('/\s+/u', ' ', $text); + + return $text; + } + // private function getTextFromElement($element, $lineNumber = 0){ + // // 初始化默认空字符串(保持原有逻辑) + // $text = ''; + + // // 1. 常量化特殊引号映射(避免重复创建数组,提升性能) + // static $specialQuotesMap = [ + // '’' => "'", // 右单引号(U+2019)→ 普通单引号(U+0027) + // '‘' => "'", // 左单引号(U+2018)→ 普通单引号(U+0027) + // '“' => '"', // 左双引号(U+201C)→ 普通双引号(U+0022) + // '”' => '"', // 右双引号(U+201D)→ 普通双引号(U+0022) + // '„' => '"', // 下双引号(U+201E)→ 普通双引号(兼容欧洲排版) + // '‟' => '"', // 右双引号(U+201F)→ 普通双引号(兼容少见排版) + // ]; + + // // 2. 提前校验元素合法性(避免后续 instanceof 无效判断,减少报错) + // if (!is_object($element) || !$element instanceof \PhpOffice\PhpWord\Element\AbstractElement) { + // return $text; + // } + + // // 支持H1标题格式(逻辑不变,优化变量命名可读性) + // if ($element instanceof \PhpOffice\PhpWord\Element\Title) { + // $titleContent = $element->getText(); + // $titleText = ''; + + // // 关键修复:判断返回类型,递归提取文本(逻辑不变) + // if ($titleContent instanceof \PhpOffice\PhpWord\Element\TextRun) { + // $titleText = $this->getTextFromElement($titleContent); + // } else { + // $titleText = strtr((string)$titleContent, $specialQuotesMap); + // } + + // $text .= $titleText . ' '; + // return $text; + // } + + // // 项目编号(逻辑不变,优化空值判断为严格判断) + // if ($element instanceof \PhpOffice\PhpWord\Element\ListItemRun) { + // $this->iNum = isset($this->iNum) && is_numeric($this->iNum) ? $this->iNum : 0; + // $this->iNum++; + // $text .= $this->iNum . ' '; + // } + + // // 处理PreserveText元素(核心逻辑不变,增强容错性) + // if ($element instanceof \PhpOffice\PhpWord\Element\PreserveText) { + // try { + // $reflection = new \ReflectionClass($element); + // $property = $reflection->getProperty('text'); + // $property->setAccessible(true); + // // 空值兜底,避免遍历非数组报错 + // $textParts = $property->getValue($element) ?? []; + // } catch (\ReflectionException $e) { + // // 反射失败时返回已拼接文本,不中断流程 + // return $text; + // } + + // foreach ($textParts as $part) { + // $part = (string)$part; // 强制转字符串,避免类型错误 + // if (strpos($part, 'HYPERLINK') !== false) { + // $decoded = html_entity_decode($part, ENT_QUOTES | ENT_HTML5); + // // 邮箱正则不变,保持原有匹配逻辑 + // if (preg_match('/mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10})/i', $decoded, $match)) { + // $text .= $match[1] . ' '; + // } + // } else { + // $text .= $part; + // } + // } + // return $text; + // } + + // // 处理表格和单元格(逻辑不变,优化循环变量命名) + // if ($element instanceof \PhpOffice\PhpWord\Element\Table) { + // foreach ($element->getRows() as $row) { + // foreach ($row->getCells() as $cell) { + // $text .= $this->getTextFromElement($cell); + // } + // } + // return $text; + // } + + // if ($element instanceof \PhpOffice\PhpWord\Element\Cell) { + // foreach ($element->getElements() as $child) { + // $text .= $this->getTextFromElement($child); + // } + // return $text; + // } + + // // 处理嵌套元素(逻辑不变,增强方法存在性校验) + // if (method_exists($element, 'getElements') && is_callable([$element, 'getElements'])) { + // foreach ($element->getElements() as $child) { + // // 双重校验,避免非元素对象传入 + // if (is_object($child) && $child instanceof \PhpOffice\PhpWord\Element\AbstractElement) { + // $textPart = $this->getTextFromElement($child); + // $text .= $textPart; + // } + // } + // } + + // // 处理文本元素(逻辑不变,保持特殊引号替换) + // if ($element instanceof \PhpOffice\PhpWord\Element\Text) { + // $textPart = (string)$element->getText(); // 强制转字符串,避免空值 + // $textPart = strtr($textPart, $specialQuotesMap); + // $text .= $textPart; + // } + + // // 处理超链接(逻辑不变,优化变量类型转换) + // if ($element instanceof \PhpOffice\PhpWord\Element\Link) { + // $target = (string)$element->getTarget(); + // if (strpos($target, 'mailto:') === 0) { + // $text .= rtrim(str_replace('mailto:', '', $target)) . ' '; + // } + // $linkText = strtr((string)$element->getText(), $specialQuotesMap); + // $text .= $linkText . ' '; + // } + + // // 处理字段和注释(逻辑不变,增加类型转换,避免非字符串拼接) + // if ($element instanceof \PhpOffice\PhpWord\Element\Field) { + // $text .= (string)$element->getContent() . ' '; + // } + // if ($element instanceof \PhpOffice\PhpWord\Element\Note) { + // $text .= (string)$element->getContent() . ' '; + // } + + // // 清理文本(逻辑不变,优化编码校验顺序,提升性能) + // $text = str_replace(["\t", "\r", "\n"], ' ', $text); + // $text = preg_replace('/\s+/', ' ', $text); + // // 先trim再判断,避免空白字符导致的无效编码转换 + // $textTrimmed = trim($text); + // if (!empty($textTrimmed) && !mb_check_encoding($textTrimmed, 'UTF-8')) { + // $text = mb_convert_encoding($text, 'UTF-8', 'GBK'); + // } + + // return $text; + // } /** * 从 Word 文档提取摘要和关键词 * @return array 提取结果 */ + function extractContentIntervals($str, $markers = []) { + // 1. 初始化标记(支持自定义,默认值兼容原逻辑) + $defaultMarkers = [ + 'abstract' => 'abstract', + 'keywords' => 'keywords', + 'end_span' => '===========end-span' + ]; + $markers = array_merge($defaultMarkers, $markers); + extract($markers); // 解析为变量 $abstract, $keywords, $end_span + + // 2. 初始化结果(包含元信息) + $result = [ + 'abstract_to_keywords' => '', + 'keywords_to_end' => '', + 'positions' => [ // 标记位置信息(-1 表示未找到) + 'abstract' => -1, + 'keywords' => -1, + 'end_span' => -1 + ], + 'is_valid' => false, // 整体区间是否有效 + 'error' => '' // 错误信息(如标记顺序异常) + ]; + + // 3. 定位 Abstract(不区分大小写) + $absPos = stripos($str, $abstract); + if ($absPos === false) { + $result['error'] = "未找到标记: {$abstract}"; + return $result; + } + $result['positions']['abstract'] = $absPos; + $absEndPos = $absPos + strlen($abstract); + + // 4. 定位 Keywords(需在 Abstract 之后,不区分大小写) + $keyPos = stripos($str, $keywords, $absEndPos); + if ($keyPos === false) { + $result['error'] = "未找到 {$keywords} 或在 {$abstract} 之前"; + return $result; + } + $result['positions']['keywords'] = $keyPos; + $keyEndPos = $keyPos + strlen($keywords); + + // 5. 定位 end-span(需在 Keywords 之后,严格匹配) + $endPos = strpos($str, $end_span, $keyEndPos); + if ($endPos === false) { + $result['error'] = "未找到 {$end_span} 或在 {$keywords} 之前"; + return $result; + } + $result['positions']['end_span'] = $endPos; + + // 6. 截取区间内容(清理标记后的紧邻符号) + // 区间1:Abstract 结束 → Keywords 开始(清理标记后的冒号/空格) + $len1 = $keyPos - $absEndPos; + $part1 = substr($str, $absEndPos, $len1); + $part1 = trim($part1); + // 移除 Abstract 后可能的冒号/短横线(如 "Abstract: ..." → 去掉开头的 ":") + $part1 = ltrim($part1, ': -—'); + $result['abstract_to_keywords'] = trim($part1); + + // 区间2:Keywords 结束 → end-span 开始(同理清理) + $len2 = $endPos - $keyEndPos; + $part2 = substr($str, $keyEndPos, $len2); + $part2 = trim($part2); + $part2 = ltrim($part2, ': -—'); + $result['keywords_to_end'] = trim($part2); + + // 7. 标记为有效 + $result['is_valid'] = true; + return $result; + } public function extractFromWord() { $sContent = ''; //文本处理 $sFundContent = ''; + $aContent = []; foreach ($this->sections as $section) { foreach ($section->getElements() as $element) { $textContent = $this->getTextFromElement($element); if(empty($textContent)){ continue; } - //编码修复 - $possibleEncodings = [ - 'Windows-1252', 'UTF-8', 'GBK', 'GB2312', - 'Latin-1', 'ISO-8859-1', 'CP1252' - ]; - $sContent .= @mb_convert_encoding($textContent, 'UTF-8', implode(',', $possibleEncodings)); - if(stripos($textContent, 'Keywords:') !== false){ - $sContent .= "Keywords-End-Flag"; + if(!empty($textContent) && !mb_check_encoding($textContent, 'UTF-8')){ + $textContent = mb_convert_encoding($textContent, 'UTF-8', 'GBK'); } if(empty($sFundContent)){ $aFund = $this->getMatchedFundPhrases($sContent); @@ -1152,69 +1121,348 @@ class ArticleParserService } } } - $sContent .= "\n"; + $sContent .= $textContent."===========end-span"; } } - if(!empty($sContent) && !mb_check_encoding($sContent, 'UTF-8')){ $sContent = mb_convert_encoding($sContent, 'UTF-8', 'GBK'); } - // 2. 基础文本清理(合并多余空格,保留有效换行) - $textContent = preg_replace('/(\S)\s+/', '$1 ', $sContent); - $textContent = trim($textContent); - + $result = $this->extractContentIntervals($sContent); // 3. 提取摘要 - $abstract = ''; - $abstractPattern = '/Abstract\s*([\s\S]*?)(?=Keywords|$)/i'; - if (preg_match($abstractPattern, $textContent, $abstractMatches)) { - $abstract = trim($abstractMatches[1]); - $abstract = preg_replace('/\n+/', ' ', $abstract); + $abstract = empty($result['abstract_to_keywords']) ? '' : $result['abstract_to_keywords']; + if(!empty($abstract) && !mb_check_encoding($abstract, 'UTF-8')){ + $abstract = mb_convert_encoding($abstract, 'UTF-8', 'GBK'); } - // 4. 提取关键词(核心:仅保留两种强制匹配逻辑) - $keywords = []; - // $keywordPattern = '/Keywords:\s*([\s\S]*?)(?=\s*\d+\.|[;,]\s*[\r\n]+\s*[\r\n]+|(?i)\bintroduction|abbreviations\b|$)/i'; - $keywordPattern = '/Keywords\s*(.*?)\s*Keywords-End-Flag/s'; - - if (preg_match($keywordPattern, $textContent, $keywordMatches)) { - $keywordStr = trim($keywordMatches[1]); - - // 清理关键词列表格式(去除换行、末尾多余符号) - $keywordStr = preg_replace('/\n+/', ' ', $keywordStr); - $keywordStr = rtrim($keywordStr, ';,. '); // 去除末尾分号、逗号等 - $keywordStr = trim($keywordStr); - - // 分割并过滤有效关键词 - $keywords = preg_split('/[,;]\s*/', $keywordStr); - $keywords = array_filter(array_map('trim', $keywords), function($item) { - return !empty($item) && !ctype_space($item); - }); + $keywords = empty($result['keywords_to_end']) ? '' : $result['keywords_to_end']; + if(!empty($keywords) && !mb_check_encoding($keywords, 'UTF-8')){ + $keywords = mb_convert_encoding($keywords, 'UTF-8', 'GBK'); } - if(empty($keywords)){ - $keywordPattern = '/Keywords\s*([\s\S]*?)(?=Introduction|$)/i'; - if (preg_match($keywordPattern, $textContent, $keywordMatches)) { - $keywordStr = trim($keywordMatches[1]); - // 清理关键词列表格式(去除换行、末尾多余符号) - $keywordStr = preg_replace('/\n+/', ' ', $keywordStr); - $keywordStr = rtrim($keywordStr, ';,. '); // 去除末尾分号、逗号等 - $keywordStr = trim($keywordStr); - - // 分割并过滤有效关键词 - $keywords = preg_split('/[,;]\s*/', $keywordStr); - $keywords = array_filter(array_map('trim', $keywords), function($item) { - return !empty($item) && !ctype_space($item); - }); - } + if(!empty($sFundContent) && !mb_check_encoding($sFundContent, 'UTF-8')){ + $sFundContent = mb_convert_encoding($sFundContent, 'UTF-8', 'GBK'); } + return [ 'status' => 1, 'msg' => '提取成功', 'data' => [ - 'abstrart' => $abstract, - 'keywords' => $keywords, - 'fund' => $sFundContent + 'abstrart' => empty($abstract) ? '' : $this->fullDecode(str_replace('===========end-span', '',$abstract)), + 'keywords' => empty($keywords) ? '' : $this->fullDecode(str_replace('===========end-span', '',$keywords)), + 'fund' => empty($sFundContent) ? '' : $this->fullDecode(str_replace('===========end-span', '',$sFundContent)) ] ]; } + /** + * 核心解码方法 + * @param string $str 待解码字符串 + * @param int $maxDepth 最大解析深度 + * @return string + */ + private function fullDecode($str = '', int $maxDepth = 2){ + try { + if ($str === null || trim((string)$str) === '' || $maxDepth <= 0) { + return $str === null ? '' : trim((string)$str); + } + + $str = (string)$str; + + // Unicode解码 + if (method_exists($this, 'decodeUnicode')) { + $str = $this->decodeUnicode($str); + } else { + $str = preg_replace_callback( + '/\\\\[uU]([0-9a-fA-F]{4})/', + function ($m) { + $code = hexdec($m[1]); + return ($code >= 0x20 && $code <= 0x10FFFF) ? mb_chr($code, 'UTF-8') : $m[0]; + }, + $str + ); + } + + // 预编译正则 + $regexps = [ + 'ob0' => '/0B\s*\\?0/', + 'dl18' => '/DL\s*\\?\.18/', + 'qMarkNum' => '/\\?(\d+)/', + 'qMarkDotNum' => '/\\?(\.\d+)/', + 'neNum' => '/≠\s*(\d+)/u', + 'leNum' => '/≤\s*(\d+)/u', + 'mixSymbol' => '/\\?\s*(?:、|,)\s*\\?\s*(?:、|,)\s*\\?(\d+)/u', + 'leNeMark' => '/(LE|NE)\s*\\?(\d+)/i', + 'unicode' => '/\\\\[uU]([0-9a-fA-F]{4})/', + 'wordBin' => '/(?:\\xE2\\x89\\x86|\\xE2\s*0x89\s*0x86|e28986|\\xE2\\x89\\x87|\\xE2\s*0x89\s*0x87|e28987|\\xE2\\x89\\x80|\\xE2\s*0x89\s*0x80|e28980)/i', + 'wordEntity' => '/&#\s*(?:x|X)?\s*(2264|2265|2260)\s*;?/i', + 'repeatSymbol' => '/(≤|≥|≠)\1+/u', + 'gbkSymbol' => '/\xA1\xF2|\xA1\xF3|\xA1\xF0/' + ]; + + // 预定义替换映射 + $maps = [ + 'htmlEntity' => [ + '≤' => '≤', '≤' => '≤', '≤' => '≤', '≤' => '≤', + '≥' => '≥', '≥' => '≥', '≥' => '≥', '≥' => '≥', + '≠' => '≠', '≠' => '≠', '≠' => '≠', '≠' => '≠', + '&le' => '≤', '&ge' => '≥', '&ne' => '≠', + 'ࣘ' => '≤', 'ࣙ' => '≥', 'ࣔ' => '≠', + '≤' => '≤', '≥' => '≥', '≠' => '≠', + '<' => '≤', '>' => '≥', + ], + 'wordBin' => [ + "\xE2\x89\x86" => '≤', "\xE2\x89\x87" => '≥', "\xE2\x89\x80" => '≠', + "\xe2\x89\x86" => '≤', "\xe2\x89\x87" => '≥', "\xe2\x89\x80" => '≠', + 'e28986' => '≤', '\xe2\x89\x86' => '≤', '\xe20x890x86' => '≤', + 'e28987' => '≥', '\xe2\x89\x87' => '≥', '\xe20x890x87' => '≥', + 'e28980' => '≠', '\xe2\x89\x80' => '≠', '\xe20x890x80' => '≠', + ], + 'wordEntity' => ['2264' => '≤', '2265' => '≥', '2260' => '≠'], + 'gbkSymbol' => ["\xA1\xF2" => '≤', "\xA1\xF3" => '≥', "\xA1\xF0" => '≠'], + ]; + + $unicodeCallback = function ($m) { + $code = hexdec($m[1]); + return ($code >= 0x20 && $code <= 0x10FFFF) ? mb_chr($code, 'UTF-8') : $m[0]; + }; + + $depth = 0; + $hasChange = false; + $currentStr = $str; + + // 循环解码 + do { + $depth++; + $hasChange = false; + $prevStr = $currentStr; + + // Unicode转义解码 + $currentStr = preg_replace_callback($regexps['unicode'], $unicodeCallback, $currentStr); + + //HTML实体替换 + $currentStr = strtr($currentStr, $maps['htmlEntity']); + $currentStr = html_entity_decode( + $currentStr, + ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE, + 'UTF-8' + ); + + // Word特殊符号乱码修复 + if (preg_match($regexps['wordBin'], $currentStr)) { + $tempStr = str_replace(' ', '', $currentStr); + $currentStr = str_ireplace(array_keys($maps['wordBin']), $maps['wordBin'], $tempStr); + } + if (preg_match($regexps['wordEntity'], $currentStr)) { + $currentStr = preg_replace_callback( + $regexps['wordEntity'], + function ($m) use ($maps) { + return $maps['wordEntity'][$m[1]] ?? $m[0]; + }, + $currentStr + ); + } + if (preg_match($regexps['gbkSymbol'], $currentStr)) { + $currentStr = strtr($currentStr, $maps['gbkSymbol']); + } + if (preg_match($regexps['repeatSymbol'], $currentStr)) { + $currentStr = preg_replace($regexps['repeatSymbol'], '$1', $currentStr); + } + + //业务场景专属替换 + if (preg_match($regexps['neNum'], $currentStr)) { + $currentStr = preg_replace($regexps['neNum'], '≠$1', $currentStr); + } + if (preg_match($regexps['leNum'], $currentStr)) { + $currentStr = preg_replace($regexps['leNum'], '≤$1', $currentStr); + } + if (preg_match($regexps['qMarkNum'], $currentStr)) { + $currentStr = preg_replace($regexps['qMarkNum'], '≥$1', $currentStr); + } + if (preg_match($regexps['qMarkDotNum'], $currentStr)) { + $currentStr = preg_replace($regexps['qMarkDotNum'], '≥0$1', $currentStr); + } + if (preg_match($regexps['mixSymbol'], $currentStr)) { + $currentStr = preg_replace($regexps['mixSymbol'], '≤≥≠$1', $currentStr); + } + if (preg_match($regexps['leNeMark'], $currentStr)) { + $currentStr = preg_replace_callback( + $regexps['leNeMark'], + function ($m) { + return strtoupper($m[1]) === 'LE' ? '≤' . $m[2] : '≠' . $m[2]; + }, + $currentStr + ); + } + + $hasChange = ($currentStr !== $prevStr); + } while ($depth < $maxDepth && $hasChange); + + // 最终清理 + $currentStr = trim($currentStr, ':'); + $currentStr = strtr($currentStr, $maps['htmlEntity']); + + return $currentStr; + + } catch (\Throwable $e) { + return trim((string)$str); + } + } + + // private function fullDecode($str, $maxDepth = 5) { + // // 空值/深度为0,直接返回(提前终止,避免无效操作) + // if (empty($str) || $maxDepth <= 0) { + // return $str; + // } + + // // 【性能优化1:预编译所有正则表达式】避免每次循环重新解析正则 + // // 预编译:≥专属场景正则 + // $regOb0 = '/0B\s*\?0/'; + // $regDl18 = '/DL\s*\?.18/'; + // // 预编译:≥通用场景正则 + // $regQMarkNum = '/\?(\d+)/'; + // $regQMarkDotNum = '/\?(\.\d+)/'; + // // 预编译:≤、≠空格修复正则 + // $regNeNum = '/≠\s*(\d+)/'; + // $regLeNum = '/≤\s*(\d+)/'; + // // 预编译:混合符号乱码正则(中文顿号/英文逗号) + // $regMixCn = '/(\?)\s*、\s*(\?)\s*、\s*(\?)(\d+)/'; + // $regMixEn = '/(\?)\s*,\s*(\?)\s*,\s*(\?)(\d+)/'; + // // 预编译:≤、≠专属标识正则 + // $regLeMark = '/LE\s*\?(\d+)/'; + // $regNeMark = '/NE\s*\?(\d+)/'; + // // 预编译:Unicode转义正则(提取到外部,避免闭包重复创建) + // $regUnicode = '/\\\\u([0-9a-fA-F]{4})/'; + + // // 【性能优化2:预定义常量/映射】避免循环内重复创建数组/字符串 + // // HTML实体映射(一次性定义,避免循环内重复赋值) + // $htmlEntityMap = [ + // '≤' => '≤', '≤' => '≤', '≤' => '≤', + // '≥' => '≥', '≥' => '≥', '≥' => '≥', + // '≠' => '≠', '≠' => '≠', '≠' => '≠', + // ]; + // // 不间断空格替换数组 + // $nbspReplace = [chr(0xC2) . chr(0xA0), chr(0xA0)]; + // // Unicode回调函数(预定义,避免循环内重复创建闭包) + // $unicodeCallback = function ($m) { + // return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0]; + // }; + + // $original = $str; + // $depth = 0; + // $hasChange = false; // 标记是否有变化,提前终止循环 + + // // 循环解码:仅在有变化且未达最大深度时执行 + // do { + // $depth++; + // $hasChange = false; + // $prevStr = $str; // 保存当前状态,用于判断变化 + + // // 1. 解码Unicode转义(\uXXXX格式) + // $str = $this->decodeUnicode($str); + + // // 2. 解码HTML实体(先替换专属实体,再执行通用解码) + // $str = strtr($str, $htmlEntityMap); // 高性能替换(strtr比str_replace快) + // $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8'); + + // // 3. 再次处理遗漏的Unicode转义(使用预编译正则+预定义回调) + // $str = preg_replace_callback($regUnicode, $unicodeCallback, $str); + + // // 4. 替换不间断空格为普通空格(strtr比str_replace更高效) + // $str = str_replace($nbspReplace, ' ', $str); + + // // 5. 核心替换逻辑(优化执行顺序,避免覆盖) + // // 5.1 原有≥专属场景(保留) + // $str = preg_replace($regOb0, '0B≥30', $str, -1, $count1); + // $str = preg_replace($regDl18, 'DL≥0.18', $str, -1, $count2); + // // 5.2 ≤、≠空格修复(保留) + // $str = preg_replace($regNeNum, '≠$1', $str, -1, $count3); + // $str = preg_replace($regLeNum, '≤$1', $str, -1, $count4); + // // 5.3 原有≥通用场景(保留) + // $str = preg_replace($regQMarkNum, '≥$1', $str, -1, $count5); + // $str = preg_replace($regQMarkDotNum, '≥0$1', $str, -1, $count6); + // // 5.4 混合符号乱码还原(保留) + // $str = preg_replace($regMixCn, '≤、≥、≠$4', $str, -1, $count7); + // $str = preg_replace($regMixEn, '≤、≥、≠$4', $str, -1, $count8); + // // 5.5 ≤、≠专属标识还原(保留) + // $str = preg_replace($regLeMark, '≤$1', $str, -1, $count9); + // $str = preg_replace($regNeMark, '≠$1', $str, -1, $count10); + + // // 5.6 修复前缀"d with "乱码(保留) + // $str = str_replace('d with ', 'd with ', $str, $count11); + + // // 【性能优化3:统计所有替换次数,判断是否有变化】 + // $totalCount = $count1 + $count2 + $count3 + $count4 + $count5 + $count6 + + // $count7 + $count8 + $count9 + $count10 + $count11; + // if ($totalCount > 0 || $str !== $prevStr) { + // $hasChange = true; + // $original = $str; + // } + + // // 【性能优化4:提前终止】单次循环无变化,直接退出 + // if (!$hasChange) { + // break; + // } + + // } while ($depth < $maxDepth); // 改用do-while,减少循环判断次数 + + // // 最终清理:仅执行一次trim + // return trim($str, ':'); + // } + // private function fullDecode($str, $maxDepth = 5) { + // if (empty($str) || $maxDepth <= 0) { + // return $str; + // } + + // $original = $str; + // $depth = 0; + + // // 循环解码,直到无变化或达到最大次数 + // while (true) { + // $depth++; + // if ($depth > $maxDepth) { + // break; // 防止过度解码导致死循环 + // } + + // // 1. 解码 Unicode 转义(\uXXXX 格式) + // $str = $this->decodeUnicode($str); + + // // 2. 解码 HTML 实体(&、'、< 等) + // $str = html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8'); + + // $str = preg_replace_callback('/\\\\u([0-9a-fA-F]{4})/', function ($m) { + // return mb_chr(hexdec($m[1]), 'UTF-8') ?: $m[0]; + // }, $str); + // $str = str_replace([chr(0xC2).chr(0xA0), chr(0xA0)], ' ', $str); + + // // 2. 核心:强制匹配所有可能的乱码格式,还原≥ + // // 匹配:0B?0、0B ?0、0B ?0(空格/制表符)→ 0B≥30 + // $str = preg_replace('/0B\s*\?0/', '0B≥30', $str); + // // 匹配:DL?.18、DL ?.18、DL ?.18 → DL≥0.18 + // $str = preg_replace('/DL\s*\?.18/', 'DL≥0.18', $str); + // // 通用匹配:数字前的?(如?30、?0.18)→ ≥30、≥0.18(防止其他变体) + // $str = preg_replace('/\?(\d+)/', '≥$1', $str); + // $str = preg_replace('/\?(\.\d+)/', '≥0$1', $str); + + // // 3. 修复前缀的"d with "可能的乱码(若有) + // $str = str_replace('d with ', 'd with ', $str); // 若前缀也乱码,可同步替换 + + // // 若解码后无变化,退出循环 + // if ($str === $original) { + // break; + // } + + // $original = $str; + // } + + // return trim($str,':'); + // } + private function decodeUnicode($str) { + return preg_replace_callback( + '/\\\\u([0-9a-fA-F]{4})/', + function ($matches) { + // 将十六进制 Unicode 码转为 UTF-8 字符 + return mb_convert_encoding(pack('H*', $matches[1]), 'UTF-8', 'UCS-2BE'); + }, + $str + ); + } private function getMatchedFundPhrases($content = '') { if (empty($content)) { return []; @@ -1223,7 +1471,7 @@ class ArticleParserService // 基金支持词组列表 $fundPhrases = [ 'Supported by', 'Funded by', 'Sponsored by', 'Supported in part by', - 'Funding was provided by', 'Funded in part by' + 'Funding was provided by', 'Funded in part by','FUNDING:' ]; // 1. 转义词组中的特殊字符,使用 # 作为分隔符