使用python提取网页中的文本信息

ELIX · 2025-06-17 · 2025-06-17

介绍

一个python脚本，用于获取《崩坏：星穹铁道》的剧情信息。大致思路是：

1.获取各章节的子链接；

2.获取子链接的html源码；

3.处理html，去除无效信息，并将各html标签转为md标签（如option转为`>`等，便于在blog中展示）；

4.合并各章节内容，生成目录，并再次清洗

本次应用的核心在于对信息的处理，例如对特定标签的特殊处理、标题范围的控制（处理不当会导致整段文章被识别为标题），以及多余空行的清理等；整体难度甚至低于之前的音乐爬取应用。

编码

1.获取html信息，并按照章节保存。此步骤会对数据进行初步处理，并将html标签转为md的标签。

import os
import re
import urllib.parse
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

# Collect every chapter together with the links of its sub-sections
def extract_chapters(url, timeout=30):
    """Scrape the chapter index page and return the sub-section links per chapter.

    Args:
        url: Address of the index page that lists all chapters.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one entry per ``mission-line`` container that has an
        anchor. Each entry is a list whose first item is the chapter title and
        whose remaining items are the ``href`` values of that chapter's
        sub-sections, in page order.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the request times out.
    """
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url, timeout=timeout)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    links = []
    for container in soup.find_all('div', class_='mission-line'):
        anchor = container.find('a')
        if anchor is None:
            # A chapter container without an anchor has no title to use.
            continue

        chapter = [anchor.get_text(strip=True)]

        # Sub-sections are read from the first 'wrap-content' div that
        # follows the chapter container in the document.
        wrap_content = container.find_next('div', class_='wrap-content')
        if wrap_content:
            for subsection in wrap_content.find_all('div', class_='mission-line-child-kt'):
                sub_anchor = subsection.find('a')
                # Anchors without an href are skipped instead of raising KeyError.
                if sub_anchor and sub_anchor.get('href'):
                    chapter.append(sub_anchor['href'])

        links.append(chapter)

    return links

# Convert the HTML source of a story page into Markdown-like text
def html_to_markdown(html):
    """Convert the HTML of a story page into cleaned, Markdown-flavoured text.

    Site-specific widgets (phone messages, mail options, ruby annotations,
    fold frames, plot options) are replaced by plain-text/Markdown
    equivalents, headings and lists are flattened, and the result is trimmed
    to the span between the "## 剧情内容" heading and the "取自“" footer when
    those markers are present. Links and images are not converted; only their
    text survives the final ``get_text`` call.

    Args:
        html: HTML source of the page as a string.

    Returns:
        The extracted text with surrounding whitespace stripped.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Remove elements that never contain story text.
    for tag in soup.find_all(['script', 'style']):
        tag.decompose()

    # Phone-message dialogue: render each message as "sender: content".
    for msg in soup.find_all(['div'], class_=['MessageToMe', 'MessageFromMe']):
        sender = msg.find('div', class_='SenderName')
        sender_name = sender.get_text(strip=True) if sender else "未知发送者"
        # The first bubble of either side holds the message body.
        bubble = msg.find('div', class_=['MessageLeft', 'MessageRight'])
        content = bubble.get_text(strip=True) if bubble else ""
        msg.replace_with(f"{sender_name}: {content}")

    for msgopt in soup.find_all(['div'], class_=['mailOptions']):
        msgopt.replace_with(f"**{msgopt.get_text(strip=True)}**\n")

    # Hidden blocks are dropped only after the message widgets were rewritten.
    for tag in soup.find_all('div', {'style': 'display:none'}):
        tag.decompose()

    # Ruby annotations become "base(reading)".
    for ruby in soup.find_all('ruby'):
        rb_tag = ruby.find('rb')
        rt_tag = ruby.find('rt')
        rb = rb_tag.get_text(strip=True) if rb_tag else ''
        rt = rt_tag.get_text(strip=True) if rt_tag else ''
        ruby.replace_with(f"{rb}({rt})" if rt else rb)

    # Fold frames: bold title followed by the folded content.
    for fold in soup.find_all(class_='foldFrame'):
        title_tag = fold.find(class_='foldTitle')
        title = title_tag.get_text(strip=True) if title_tag else ''
        content = fold.find(class_='foldContent')
        if content:
            content_str = content.get_text('\n', strip=True)
            fold.replace_with(f"**{title}**\n{content_str}\n")

    # Plot options: one "- option" line per choice, then the branch contents.
    for plot in soup.find_all(class_='plotFrame'):
        options = [opt.get_text(strip=True) for opt in plot.find_all(class_='plotOptions')]
        contents = [cont.get_text(strip=True) for cont in plot.find_all(class_='content')]
        result = "\n".join([f"- {opt}" for opt in options] + contents)
        plot.replace_with(result)

    # Headings: h2 stays a Markdown heading, h3 is demoted to bold text so
    # that long passages are not rendered as titles.
    for h2 in soup.find_all('h2'):
        h2.replace_with(f"## {h2.get_text(strip=True)}\n")

    for h3 in soup.find_all('h3'):
        h3.replace_with(f"**{h3.get_text(strip=True)}**\n")

    # Lists: collapse each item to its text and drop the <ul> wrapper.
    for ul in soup.find_all('ul'):
        for li in ul.find_all('li'):
            li.string = li.get_text(strip=True)
        if ul.parent is not None:
            ul.unwrap()

    # Block quotes become emphasised text.
    for blockquote in soup.find_all('blockquote'):
        text = blockquote.get_text('\n', strip=True)
        blockquote.replace_with(f"*{text}*\n")

    # Definition lists become bullet lists of their <dd> entries.
    for dl in soup.find_all('dl'):
        items = [f"- {dd.get_text(strip=True)}" for dd in dl.find_all('dd')]
        dl.replace_with("\n".join(items) + "\n")

    # Extract the final text and collapse runs of blank lines.
    text = soup.get_text('\n', strip=True)
    text = re.sub(r'\n{3,}', '\n\n', text)

    # Keep only the story section. str.find() returns -1 when a marker is
    # absent; slicing with -1 would keep just the last character (start) or
    # drop the last character (end), so each cut is applied only when its
    # marker was actually found.
    start_index = text.find("## 剧情内容")
    if start_index != -1:
        text = text[start_index:]
    end_index = text.find("取自“")
    if end_index != -1:
        text = text[:end_index]

    return text.strip()

# Download every sub-section and store it as one text file per sub-section
def create_folders_and_files(data_list, base_url):
    """Create one folder per chapter and one ``.txt`` file per sub-section.

    Folder and file names are prefixed with a running number so that a later
    step can restore the original order. Characters that are path separators
    or not allowed in Windows file names are replaced by ``_``.

    Args:
        data_list: List of lists as returned by ``extract_chapters``: the
            first item of each inner list is the chapter title, the remaining
            items are site-relative links containing ``/sr/``.
        base_url: Scheme and host that is prepended to every link.

    Returns:
        None. Progress and failures are reported with ``print``; a failing
        page does not stop the remaining downloads.
    """
    # Path separators and characters that Windows rejects in file names.
    unsafe_chars = re.compile(r'[\\/:*?"<>|]')

    folder_count = 1
    for sublist in data_list:
        if not sublist:  # skip empty entries
            continue

        folder_name = unsafe_chars.sub('_', f"{folder_count}{sublist[0]}")
        folder_count += 1
        os.makedirs(folder_name, exist_ok=True)

        chapter_count = 1
        for url_path in sublist[1:]:
            if "/sr/" not in url_path:
                print(f"跳过无效链接: {url_path}")
                continue

            # The part after /sr/ is the percent-encoded page name.
            encoded_part = url_path.split("/sr/", 1)[1]
            page_name = urllib.parse.unquote(encoded_part)
            file_name = unsafe_chars.sub('_', f"{chapter_count}{page_name}") + ".txt"
            file_path = os.path.join(folder_name, file_name)

            full_url = base_url + url_path

            try:
                response = requests.get(full_url, timeout=30)
                # Treat HTTP error pages as download failures instead of
                # writing their content out as story text.
                response.raise_for_status()
                # Decode as UTF-8 explicitly, as extract_chapters does.
                response.encoding = 'utf-8'
                text = html_to_markdown(response.text)

                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(text)
                print(f"已创建文件: {file_path}")

            except RequestException as e:
                print(f"下载失败 {full_url}: {str(e)}")
            except Exception as e:
                # Best-effort batch job: report the problem and continue.
                print(f"处理错误: {str(e)}")
            chapter_count += 1

# Example usage: scrape the mission index and download every chapter
if __name__ == "__main__":
    print("start")
    BASE_URL = "https://wiki.biligame.com"
    url = "https://wiki.biligame.com/sr/%E5%BC%80%E6%8B%93%E4%BB%BB%E5%8A%A1"
    chapter_links = extract_chapters(url)
    create_folders_and_files(chapter_links, BASE_URL)

2.将所有章节合并为一个文件，并有针对性地剔除冗余信息。

import os
import re
import glob


def natural_sort_key(s):
    """Build a natural-sort key so that embedded numbers compare numerically.

    The string is split on runs of digits. ``re.split`` with a capturing
    group always alternates text, digits, text, ..., so the odd positions are
    exactly the digit runs. Deciding by position rather than by
    ``str.isdigit()`` avoids a ``ValueError`` on characters such as "①" or
    "²", which ``isdigit()`` accepts but ``int()`` cannot parse, and keeps the
    element types aligned between keys.

    Args:
        s: The string to build a key for.

    Returns:
        A list alternating lower-cased text pieces and integers.
    """
    pieces = re.split(r'(\d+)', s)
    return [int(piece) if index % 2 else piece.lower()
            for index, piece in enumerate(pieces)]


def clean_title(name):
    """Strip the leading ordering number, and any separator after it, from a name.

    The prefix is a run of digits at the very start, optionally followed by
    dots, whitespace, underscores or hyphens. Names without such a prefix are
    returned unchanged apart from whitespace trimming.

    NOTE: every leading digit is removed, so a title that itself begins with
    a digit loses that digit as well.
    """
    without_prefix = re.sub(r'^\d+[.\s_\-]*', '', name, count=1)
    return without_prefix.strip()


def merge_txt_files(root_path, output_file="XT1.txt"):
    """Merge the numbered chapter folders under ``root_path`` into one file.

    Each sub-folder whose name starts with a digit becomes a level-1 heading
    and each ``.txt`` file inside it whose name starts with a digit becomes a
    level-2 heading followed by its cleaned content. Folders and files are
    ordered by natural sort of their names.

    Args:
        root_path: Directory that contains the numbered chapter folders.
        output_file: Path of the merged file. A relative path is resolved
            against the current working directory, not ``root_path``.
    """
    # Collect the numbered folders; the context manager closes the directory
    # iterator deterministically.
    with os.scandir(root_path) as entries:
        folders = [
            entry.path
            for entry in entries
            if entry.is_dir() and re.match(r'^\d', entry.name)
        ]
    folders.sort(key=lambda x: natural_sort_key(os.path.basename(x)))

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for folder in folders:
            # Level-1 heading: the folder name without its ordering number.
            folder_name = os.path.basename(folder)
            clean_folder = clean_title(folder_name)
            outfile.write(f"# {clean_folder}\n\n")

            # Escape the folder path so that characters such as "[" or "*"
            # in a chapter name are matched literally instead of being
            # interpreted as glob wildcards.
            pattern = os.path.join(glob.escape(folder), "*.txt")
            txt_files = [
                f for f in glob.glob(pattern)
                if re.match(r'^\d', os.path.basename(f))
            ]
            txt_files.sort(key=lambda x: natural_sort_key(os.path.basename(x)))

            for txt_file in txt_files:
                # Level-2 heading: the file name without number and extension.
                file_name = os.path.splitext(os.path.basename(txt_file))[0]
                clean_file = clean_title(file_name)
                outfile.write(f"## {clean_file}\n\n")

                with open(txt_file, 'r', encoding='utf-8') as infile:
                    content = infile.read().strip()
                    outfile.write(process_text(content) + "\n\n")


def process_text(text):
    """Remove leftover markers from the text of a single chapter file.

    * A line consisting only of a hyphen (ignoring surrounding whitespace)
      has that hyphen removed; the whitespace around it is kept.
    * The heading "## 剧情内容" is deleted.
    * The heading "## 注释" is demoted to the plain text "注释".

    Lines are re-joined with "\n", so other line terminators are normalised.
    """
    cleaned_lines = [
        raw.replace('-', '', 1) if raw.strip() == '-' else raw
        for raw in text.splitlines()
    ]
    result = '\n'.join(cleaned_lines)
    for old, new in (('## 剧情内容', ''), ('## 注释', '注释')):
        result = result.replace(old, new)
    return result

if __name__ == "__main__":
    # Example usage - change this to your actual folder path
    base_path = "C:/Users/lgf/PycharmProjects/PythonProject2"
    # The default output name is relative, so the file is written to the
    # current working directory; resolve it once so the message below
    # reports the real location instead of base_path glued to the file name.
    output_path = os.path.abspath("XT1.txt")
    merge_txt_files(base_path, output_path)
    print("finish,the txt saved in:" + output_path)