    # Build one row per chapter: [chapter_title, link, link, ...]
    links = []
    for i, chapter in enumerate(results, 1):
        link = [chapter['chapter_title']]
        for j, sub in enumerate(chapter['subsections'], 1):
            link.append(sub['link'])
        links.append(link)
    return links
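# Resulting shape, with hypothetical URLs for illustration (the
# 'chapter_title' and 'link' keys come from the scraped results):
#   [['Chapter 1', 'https://example.com/ep1', 'https://example.com/ep2'],
#    ['Chapter 2', 'https://example.com/ep3']]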

# Process the HTML source
def html_to_markdown(html):
    soup = BeautifulSoup(html, 'html.parser')

    # Remove unwanted elements
    for tag in soup.find_all(['script', 'style']):
        tag.decompose()
    # Bold the mail-option labels
    for msgopt in soup.find_all('div', class_='mailOptions'):
        msgopt.replace_with(f"**{msgopt.get_text(strip=True)}**\n")
    for tag in soup.find_all('div', {'style': 'display:none'}):
        tag.decompose()

    # Convert special tags: ruby annotations become "base(reading)"
    for ruby in soup.find_all('ruby'):
        rb = ruby.find('rb').get_text(strip=True) if ruby.find('rb') else ''
        rt = ruby.find('rt').get_text(strip=True) if ruby.find('rt') else ''
        ruby.replace_with(f"{rb}({rt})" if rt else rb)
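    # For example, <ruby><rb>漢字</rb><rt>かんじ</rt></ruby> collapses to the
    # plain text "漢字(かんじ)".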
    # Handle fold frames (collapsible boxes)
    for fold in soup.find_all(class_='foldFrame'):
        title = fold.find(class_='foldTitle').get_text(strip=True) if fold.find(class_='foldTitle') else ''
        content = fold.find(class_='foldContent')
        if content:
            content_str = content.get_text('\n', strip=True)
            fold.replace_with(f"**{title}**\n{content_str}\n")
    # Handle plot (story branch) options
    for plot in soup.find_all(class_='plotFrame'):
        options = [opt.get_text(strip=True) for opt in plot.find_all(class_='plotOptions')]
        # contents = [cont.get_text('\n', strip=True) for cont in plot.find_all(class_='content')]
        contents = [cont.get_text(strip=True) for cont in plot.find_all(class_='content')]
        result = "\n".join([f"- {opt}" for opt in options] + contents)
        plot.replace_with(result)
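    # Illustrative input/output (hypothetical markup using the class names above):
    #   <div class="plotFrame">
    #     <div class="plotOptions">Choice A</div>
    #     <div class="plotOptions">Choice B</div>
    #     <div class="content">Shared reply text</div>
    #   </div>
    # becomes:
    #   - Choice A
    #   - Choice B
    #   Shared reply text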
    # for img in soup.find_all('img'):
    #     text = img.get_text(strip=True)
    #     if img.get('alt'):
    #         img.replace_with(f"![{img['alt']}]({img.get('src', '')})")
    # for a in soup.find_all('a'):
    #     text = a.get_text(strip=True)
    #     href = a.get('href', '')
    #     a.replace_with(f"[{text}]({href})" if href else text)
    # Handle headings and lists
    for h2 in soup.find_all('h2'):
        h2.replace_with(f"## {h2.get_text(strip=True)}\n")
    for h3 in soup.find_all('h3'):
        h3.replace_with(f"**{h3.get_text(strip=True)}**\n")
    for ul in soup.find_all('ul'):
        # Replace each list item with its bare text, then drop the <ul> wrapper
        for li in ul.find_all('li'):
            li_text = li.get_text(strip=True)
            li.string = li_text
        if ul.parent is not None:
            ul.unwrap()
    # Handle blockquotes
    for blockquote in soup.find_all('blockquote'):
        text = blockquote.get_text('\n', strip=True)
        blockquote.replace_with(f"*{text}*\n")
    # Handle definition lists (only the <dd> entries are kept)
    for dl in soup.find_all('dl'):
        items = [f"- {dd.get_text(strip=True)}" for dd in dl.find_all('dd')]
        dl.replace_with("\n".join(items) + "\n")
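# html_to_markdown mutates the soup in place; if a string result is expected,
# it would presumably finish with something like
#     return soup.get_text('\n', strip=True)
# (assumed; no explicit return appears in the function body above).

# merge_txt_files below relies on natural_sort_key and clean_title, which are
# defined elsewhere. A minimal sketch of plausible implementations, assuming
# numeric-prefix ordering (not the original definitions):
#
#     def natural_sort_key(name):
#         # Split into digit/non-digit runs so "10 x" sorts after "2 x"
#         return [int(p) if p.isdigit() else p for p in re.split(r'(\d+)', name)]
#
#     def clean_title(name):
#         # Strip a leading numeric ordering prefix, e.g. "01 Title" -> "Title"
#         return re.sub(r'^\d+\s*', '', name).strip()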
def merge_txt_files(root_path, output_file="XT1.txt"):
    """Merge the .txt files in the folders under root_path into one file."""
    # Collect subfolders whose names start with a digit, in natural numeric order
    folders = []
    for entry in os.scandir(root_path):
        if entry.is_dir() and re.match(r'^\d', entry.name):
            folders.append(entry.path)
    folders.sort(key=lambda x: natural_sort_key(os.path.basename(x)))
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for folder in folders:
            # Level-1 heading: the (cleaned) folder name
            folder_name = os.path.basename(folder)
            clean_folder = clean_title(folder_name)
            outfile.write(f"# {clean_folder}\n\n")
            # Collect .txt files whose names start with a digit, in natural numeric order
            txt_files = glob.glob(os.path.join(folder, "*.txt"))
            txt_files = [f for f in txt_files if re.match(r'^\d', os.path.basename(f))]
            txt_files.sort(key=lambda x: natural_sort_key(os.path.basename(x)))
            for txt_file in txt_files:
                # Level-2 heading: the (cleaned) file name without its extension
                file_name = os.path.splitext(os.path.basename(txt_file))[0]
                clean_file = clean_title(file_name)
                outfile.write(f"## {clean_file}\n\n")