/** * h2/h3 セクションを解析して、見出しテキストと直下の p 群(次の見出しの直前まで)を返す * 返り値: [ ['h_text'=>string,'h_start'=>int,'h_end'=>int,'content_start'=>int,'paras'=>[ ['html'=>string,'start'=>int,'end'=>int,'text'=>string,'link_count'=>int], ... ]], ... ] */ private function parse_hx_sections(string $html, string $level = 'h3'): array { $sections = []; $pattern_head = ($level === 'h2') ? '/]*>(.*?)<\/h2>/is' : '/]*>(.*?)<\/h3>/is'; if (!preg_match_all($pattern_head, $html, $heads, PREG_OFFSET_CAPTURE)) return []; // 次見出しの検出用(h2/h3両方) preg_match_all('/]*>/is', $html, $all_heads, PREG_OFFSET_CAPTURE); $all_pos = array_map(fn($m) => $m[1], $all_heads[0] ?? []); foreach ($heads[0] as $idx => $headMatch) { $h_full = $headMatch[0]; $h_start = $headMatch[1]; $h_len = strlen($h_full); $h_end = $h_start + $h_len; // 見出しのプレーンテキスト $h_text = ''; if (isset($heads[1][$idx][0])) { $h_text = trim(wp_strip_all_tags($heads[1][$idx][0])); } // この見出し以降で、次の見出し開始位置を探す $next_pos = strlen($html); foreach ($all_pos as $p) { if ($p > $h_start) { $next_pos = $p; break; } } $section_html = substr($html, $h_end, $next_pos - $h_end); $paras = []; if (preg_match_all('/]*>.*?<\/p>/is', $section_html, $pm, PREG_OFFSET_CAPTURE)) { foreach ($pm[0] as $pMatch) { $p_rel_start = $pMatch[1]; $p_start = $h_end + $p_rel_start; $p_html = $pMatch[0]; $p_end = $p_start + strlen($p_html); $p_text = trim(preg_replace('/\s+/u', ' ', wp_strip_all_tags($p_html))); $link_c = substr_count($p_html, ' $p_html, 'start' => $p_start, 'end' => $p_end, 'text' => $p_text, 'link_count' => $link_c, ]; } } $sections[] = [ 'h_text' => $h_text, 'h_start' => $h_start, 'h_end' => $h_end, 'content_start' => $h_end, 'paras' => $paras, ]; } return $sections; } /** 上中下に近いセクションindexを返す(本文量ベース) */ private function choose_section_indices_by_ratio(array $sections, array $ratios): array { $weights = []; foreach ($sections as $s) { $len = 0; foreach ($s['paras'] as $p) $len += max(1, mb_strlen($p['text'], 'UTF-8')); $weights[] = max(1, $len); } $total = 
array_sum($weights); if ($total <= 0) { $weights = array_fill(0, count($sections), 1); $total = count($sections); } $indices = []; foreach ($ratios as $r) { $target = (int) floor($total * $r); $acc = 0; $chosen = 0; foreach ($weights as $i => $w) { $acc += $w; if ($acc >= $target) { $chosen = $i; break; } } // 近接回避 if (!in_array($chosen, $indices, true)) $indices[] = $chosen; } // 3件に満たない場合、近いものを追加 while (count($indices) < 3 && count($sections) > count($indices)) { for ($i=0; $i= 0; $i--) { if ((int)$paras[$i]['link_count'] <= $max_links_per_p) { $chosen = $paras[$i]; break; } } if ($chosen === null) $chosen = end($paras); return (int) $chosen['end']; // ← p閉じタグの直後 } // pが無ければ、見出し直後 return (int) $section['content_start']; } /** 予備:本文全体の

Fallback: collects every <p>...</p> element in the whole body and creates
     * insertion points right after the paragraphs found at roughly 15% / 50% /
     * 85% of the paragraph list. Points already present in $have are kept, and a
     * new point is skipped when it lies within 80 bytes of an offset in use.
     *
     * Offsets are byte offsets (PREG_OFFSET_CAPTURE / strlen), not character
     * offsets.
     *
     * @param string $html Body HTML to scan.
     * @param int    $need Maximum number of insertion points wanted.
     * @param array  $have Points collected so far; each entry is read through its 'offset' key.
     * @return array List of ['offset'=>int,'zone'=>string,'ctx_heading'=>string,'ctx_text'=>string]
     *               entries. When no paragraph is found, $have is returned untouched
     *               (not trimmed to $need).
     */
    private function fallback_insertion_points_by_paragraphs(string $html, int $need, array $have): array
    {
        $points = $have;

        // \b keeps <pre>, <param>, <picture>, ... from being taken for a paragraph opener.
        if (!preg_match_all('/<p\b[^>]*>.*?<\/p>/is', $html, $found, PREG_OFFSET_CAPTURE)) {
            return $points;
        }

        // Byte offset just past each closing </p>.
        $ends = array_map(
            static fn (array $m): int => $m[1] + strlen($m[0]),
            $found[0]
        );
        $last = count($ends) - 1;

        $zones  = ['top', 'mid', 'bot'];
        $ratios = [0.15, 0.50, 0.85];

        $used_offsets = array_map(static fn ($p): int => (int) $p['offset'], $points);

        foreach ($ratios as $i => $ratio) {
            if (count($points) >= $need) {
                break;
            }

            $offset = $ends[(int) floor($last * $ratio)];

            // Proximity guard: do not place two insertion points within 80 bytes of each other.
            $too_close = false;
            foreach ($used_offsets as $u) {
                if (abs($u - $offset) < 80) {
                    $too_close = true;
                    break;
                }
            }
            if ($too_close) {
                continue;
            }

            $points[] = [
                'offset'      => $offset,
                'zone'        => $zones[$i] ?? 'mid',
                'ctx_heading' => '',
                'ctx_text'    => '',
            ];
            $used_offsets[] = $offset;
        }

        return array_slice($points, 0, $need);
    }

    /**
     * Last-resort safety net: appends $n link paragraphs to the end of the
     * original content, leaving the original content itself unchanged.
     *
     * Up to $n candidates are turned into targets; when fewer than $n usable
     * targets exist they are reused cyclically so that $n paragraphs are still
     * appended.
     *
     * @param string  $original      Content to append to.
     * @param WP_Post $source        Source post (not read by this method).
     * @param array   $candidate_ids Post IDs that may be linked to.
     * @param int     $n             Number of paragraphs to append.
     * @return string $original followed by the generated paragraphs, or $original
     *                unchanged when no usable target exists.
     */
    private function append_n_blocks_to_tail(string $original, WP_Post $source, array $candidate_ids, int $n): string
    {
        $targets = [];
        foreach ($candidate_ids as $cid) {
            if (count($targets) >= $n) {
                break;
            }
            $url = get_permalink($cid);
            if (!is_string($url) || $url === '') {
                // get_permalink() returns false when the post does not exist.
                continue;
            }
            $targets[] = ['post_id' => $cid, 'url' => $url, 'title' => get_the_title($cid)];
        }

        if ($targets === []) {
            return $original;
        }

        $target_count = count($targets);
        $result = $original;

        for ($i = 0; $i < $n; $i++) {
            $tg       = $targets[$i % $target_count];
            $meta     = $this->fetch_dest_meta($tg['url'], $tg['post_id'], $tg['title']);
            $sentence = $this->fallback_sentence_with_meta('本文の補足', $meta, $tg['url']);

            // NOTE(review): the wrapper markup was missing from the reviewed copy of this
            // file; a plain <p>...</p> pair is restored here — confirm against the original.
            $result .= '<p>' . $this->force_link_attrs($sentence) . '</p>';
        }

        return $result;
    }

    /**
     * Picks, for each chosen section, the candidate post whose text is closest
     * to that section's context (heading text plus paragraph texts). The source
     * post itself is never picked and no candidate is picked twice.
     *
     * Score = jaccard(section context, candidate tokens)
     *       + 0.05 * jaccard(source title + headings, candidate tokens).
     * A section's best candidate is accepted when its score reaches the
     * similarity threshold option, or unconditionally while the result is still
     * empty. If fewer than $need targets were found, the remainder is filled with
     * the unused candidates ranked by similarity to the whole source post.
     *
     * @param WP_Post $source        Post the links will be inserted into.
     * @param array   $sections      Sections; 'h_text' and 'paras' (entries with a 'text' key) are read.
     * @param array   $sec_indices   Indices into $sections, in the order targets should be assigned.
     * @param array   $candidate_ids Candidate destination post IDs.
     * @param int     $need          Maximum number of targets to return.
     * @return array List of ['block_index'=>int (always 0),'post_id'=>int|string,'url'=>string,
     *               'title'=>string,'score'=>float].
     */
    private function pick_targets_for_sections(WP_Post $source, array $sections, array $sec_indices, array $candidate_ids, int $need): array
    {
        if ($need <= 0) {
            return [];
        }

        // Loop-invariant: token set of the source title plus its headings.
        $source_head_tokens = $this->tokenize_to_set(
            $source->post_title . ' ' . $this->extract_headings_text($source->post_content)
        );

        $cands = [];
        foreach ($candidate_ids as $cid) {
            if ((int) $cid === (int) $source->ID) {
                continue; // never link a post to itself
            }
            $post = get_post($cid);
            if (!$post) {
                continue;
            }
            $url = get_permalink($post);
            if (!is_string($url) || $url === '') {
                continue;
            }

            $title  = get_the_title($post);
            $meta   = $this->fetch_dest_meta($url, $cid, $title);
            $tokens = $this->tokenize_to_set(
                ($meta['title'] ?? '') . ' ' . ($meta['desc'] ?? '') . ' ' . implode(' ', (array) ($meta['heads'] ?? []))
            );

            $cands[$cid] = [
                'post_id' => $cid,
                'url'     => $url,
                'title'   => $title,
                'tokens'  => $tokens,
                // Small bonus for closeness to the source title/headings; it depends
                // only on the candidate, so it is computed once here.
                'bonus'   => 0.05 * $this->jaccard($source_head_tokens, $tokens),
            ];
        }

        if ($cands === []) {
            return [];
        }

        $used   = []; // candidate id => true
        $out    = [];
        $sim_th = (float) get_option(self::OPTION_SIM_THRESHOLD, 0.02);

        foreach ($sec_indices as $sidx) {
            $section = $sections[$sidx] ?? [];
            $ctx     = $section['h_text'] ?? '';
            if (!empty($section['paras'])) {
                $ctx .= ' ' . implode(' ', array_column($section['paras'], 'text'));
            }
            $ctx_tokens = $this->tokenize_to_set($ctx);

            $best_cid = null;
            $best_sc  = -1;
            foreach ($cands as $cid => $info) {
                if (isset($used[$cid])) {
                    continue;
                }
                $sc = $this->jaccard($ctx_tokens, $info['tokens']) + $info['bonus'];
                if ($sc > $best_sc) {
                    $best_sc  = $sc;
                    $best_cid = $cid;
                }
            }

            // The first pick is accepted even below the threshold so that at least
            // one section-matched target exists.
            if ($best_cid !== null && ($best_sc >= $sim_th || $out === [])) {
                $out[] = [
                    'block_index' => 0,
                    'post_id'     => $best_cid,
                    'url'         => $cands[$best_cid]['url'],
                    'title'       => $cands[$best_cid]['title'],
                    'score'       => $best_sc,
                ];
                $used[$best_cid] = true;
            }

            if (count($out) >= $need) {
                break;
            }
        }

        // Not enough section-level matches: fill up with the candidates closest to the whole post.
        if (count($out) < $need) {
            $global = $this->tokenize_to_set(
                $source->post_title . ' '
                . $this->extract_headings_text($source->post_content) . ' '
                . wp_strip_all_tags($source->post_content)
            );

            $rank = [];
            foreach ($cands as $cid => $info) {
                if (isset($used[$cid])) {
                    continue;
                }
                $rank[$cid] = $this->jaccard($global, $info['tokens']);
            }
            arsort($rank, SORT_NUMERIC);

            foreach ($rank as $cid => $score) {
                if (count($out) >= $need) {
                    break;
                }
                $out[] = [
                    'block_index' => 0,
                    'post_id'     => $cid,
                    'url'         => $cands[$cid]['url'],
                    'title'       => $cands[$cid]['title'],
                    'score'       => $score,
                ];
                $used[$cid] = true;
            }
        }

        return array_slice($out, 0, $need);
    }