
2024-12-17 · @qjs13667164495-kBu46 & @GPT-4 · Model: gpt-3.5

qjs13667164495-kBu46:

Check the following code, point out the problems, and fix them:

```python
# coding: utf-8
import re
import wikipediaapi
import json
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from opencc import OpenCC

cc = OpenCC('t2s')  # Traditional-to-Simplified Chinese converter

# Retries
session = requests.Session()
retry = Retry(total=3, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retry)
session.mount('https://', adapter)

wiki = wikipediaapi.Wikipedia(
    language='zh',
    user_agent="MyProjectName ([email protected])"  # only language and user_agent are needed
)

string_list = ['Wikipedia', "维基", "英語", "英语", "Category", ":", "/"]


def collect_content(page, depth=1, visited=None):
    if visited is None:
        visited = set()

    if depth <= 0 or page.title in visited:
        print("reached the end")
        return []

    visited.add(page.title)
    print(f"marking page {page.title} as visited")

    # Fetch the page's categories (title[9:] strips the "Category:" prefix)
    categories = [cc.convert(category.title[9:]) for category in page.categories.values()
                  if len(category.title[9:]) <= 6]
    print(categories)
    new_text = page.text.replace(page.summary, "")
    new_text2 = re.sub(r"{\\displaystyle([^}]*)}|[\n ]", "", new_text)
    content_list = [
        {"concept": cc.convert(page.title),
         "summary": cc.convert(re.sub(r"{\\displaystyle([^}]*)}|[\n ]", "", page.summary)),
         "text": cc.convert(new_text2),
         "upclass": "机器学习",
         "categories": categories}
    ]

    print(f"{'#' * depth}level {depth}: {page.title}")

    try:
        ## Fetch links
        if len(visited) > 1:
            if len(page.links.keys()) <= 100:
                print("---link keys exist")
                ## Recursion bound: skip the loop entirely if the next depth would be 0
                if depth - 1 != 0:
                    for link_title in page.links.keys():
                        # Check whether the title is real content:
                        if any(string not in link_title for string in string_list):
                            sub_page3 = wiki.page(link_title)
                            if sub_page3.exists():
                                print(f"{'#' * (depth - 1)}level {depth - 1} link child, starting: {sub_page3.title}")
                                content_list.extend(collect_content(sub_page3, depth - 1, visited))
                            else:
                                print(f"page does not exist: {link_title}")
                else:
                    print("stop recursing into links; you have probably hit the end")
    except KeyError as e:
        # Catch KeyError and keep processing other pages
        print(f"link_ missing field in API response: {e}")
    except Exception as e:
        # Catch any other error and keep processing other pages
        print(f"link_ unexpected error: {e}")

    try:
        # Fetch category members
        if len(page.categorymembers.keys()) <= 100:
            print("---category titles exist:")
            ## Recursion bound: skip the loop entirely if the next depth would be 0
            if depth - 1 != 0:
                for cate_title in page.categorymembers.keys():
                    # Check whether the title is real content:
                    if any(string not in cate_title for string in string_list):
                        sub_page = wiki.page(cate_title)
                        if sub_page.exists():
                            print(f"{'#' * (depth - 1)}level {depth - 1} category child, starting: {sub_page.title}")
                            content_list.extend(collect_content(sub_page, depth - 1, visited))
                        else:
                            print(f"page does not exist: {cate_title}")
            else:
                print("stop recursing into categories; you have probably hit the end")
    except KeyError as e:
        # Catch KeyError and keep processing other pages
        print(f"cate_ missing field in API response: {e}")
    except Exception as e:
        # Catch any other error and keep processing other pages
        print(f"cate_ unexpected error: {e}")

    try:
        # Fetch backlinks (optional; consider removing)
        if len(page.backlinks.keys()) <= 100:
            print("---backlink titles exist:")
            ## Recursion bound: skip the loop entirely if the next depth would be 0
            if depth - 1 != 0:
                for backLink_title in page.backlinks.keys():
                    # Check whether the title is real content:
                    if any(string not in backLink_title for string in string_list):
                        sub_page2 = wiki.page(backLink_title)
                        if sub_page2.exists():
                            print(f"{'#' * (depth - 1)}level {depth - 1} backlink child, starting: {sub_page2.title}")
                            content_list.extend(collect_content(sub_page2, depth - 1, visited))
                        else:
                            print(f"page does not exist: {backLink_title}")
            else:
                print("stop recursing into backlinks; you have probably hit the end")
    except KeyError as e:
        # Catch KeyError and keep processing other pages
        print(f"back_ missing field in API response: {e}")
    except Exception as e:
        # Catch any other error and keep processing other pages
        print(f"back_ unexpected error: {e}")

    return content_list


# def save_to_json(data, output_file):
#     try:
#         with open(output_file, "r", encoding="utf-8") as f:
#             existing_data = json.load(f)
#     except (FileNotFoundError, json.JSONDecodeError):
#         existing_data = []
#
#     existing_data.extend(data)
#
#     with open(output_file, "w", encoding="utf-8") as f:
#         json.dump(existing_data, f, ensure_ascii=False, indent=4)


def save_to_json(data, output_file):
    """
    Write each item to the file as one JSON object per line.

    :param data: list of dicts to save
    :param output_file: output file path
    """
    # Open the file in append mode
    with open(output_file, "a", encoding="utf-8") as f:
        for item in data:
            # Serialize each item and write it as a single JSON line
            json_line = json.dumps(item, ensure_ascii=False)
            f.write(json_line + "\n")


root_page = wiki.page("Category:机器学习")
output_file = "machine_learning.json"
recursion_depth = 3
if root_page.exists():
    print(f"collecting '{root_page.title}'...")
    content_data = collect_content(root_page, depth=recursion_depth)
    save_to_json(content_data, output_file)
    print(f"file: '{output_file}'")
else:
    print(f"'{root_page.title}' does not exist.")
```

```python
# ## Demo (commented out): dump a single page first for inspection
# ## (the original comment referred to the title 损失函数, i.e. "loss function"):
# page_py = wiki.page("Category:Deep learning")
#
# # print("打印中文:")
# # language = "zh"
# # lpage = page_py.langlinks[language]
#
# print("中文部分:")
# lpage = page_py
#
# ti = lpage.title
# print(f"题目:{ti}")
#
# su = lpage.summary
# print(f"总结:{su}")
#
# print(f"########backLinks############")
# Property_backlinks = lpage.backlinks
# for key in Property_backlinks.keys():
#     if len(key.split('/')) >= 2:
#         continue
#     print(f"backlinks: {key}")
#
# print(f"########categoryMembers############")
# Property_categoryMembers = lpage.categorymembers
# for key in Property_categoryMembers.keys():
#     print(f"categoryMembers: {key}")
#
# print(f"##################links################")
# Property_links = lpage.links
# for key in Property_links.keys():
#     print(f"links: {key}")
#
# # for index, text in enumerate(lpage.categories.values()):
# #     if index == 1:
# #         c = text
# #         print("###################")
# #         print(f"这是精心挑选的子分类:{c.title}")
# #         print("请欣赏子分类的内容summary和category:")
# #         print(c.summary)
# #         print("%%%%%%%%%%%%")
# #         print([cc.title for cc in c.categories.values()])
# #         print("######################")
# #     else:
# #         print(f"子分类:{text.title}")
#
# categories = [category.title[9:] for category in lpage.categories.values()]
# print(f"Categories: {categories}")
#
# # def print_sections(sections, level=0):
# #     for section in sections:
# #         print(" " * level + section.title)
# #         print_sections(section.sections, level + 1)
#
# print("Sections:")
# print(lpage.sections)
# # print_sections(lpage.sections)
#
# # te = lpage.text
# # print(te)
# # ca = lpage.categories
#
# print("打印chinese全文部分:")
# print(lpage.text)
# print("打印chinese section部分:")
# se = lpage.sections
# print(se)
```

GPT-4:

Three problems stand out. First, what is not a problem: language selection. The language is fixed when the `wikipediaapi.Wikipedia` object is constructed (`language='zh'`), so `root_page = wiki.page("Category:机器学习")` already resolves against Chinese Wikipedia; `wiki.page()` takes only a title and has no `language` parameter. What recent versions of wikipedia-api do require is a `user_agent`, which the code supplies.
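A minimal sketch of the documented setup, for reference (same names as in the code above):

```python
import wikipediaapi

# The language lives on the Wikipedia object; page() receives only a title.
wiki = wikipediaapi.Wikipedia(
    user_agent="MyProjectName ([email protected])",  # mandatory in recent versions
    language="zh",
)

root_page = wiki.page("Category:机器学习")  # no language argument here
print(root_page.exists(), root_page.title)
```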

Second, the stop-word filter is inverted. `any(string not in link_title for string in string_list)` is true for nearly every title, since it only fails when the title contains all seven stop words at once, so the crawler prunes essentially nothing. The intended condition is presumably "no stop word occurs in the title", which is `all(...)`.
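A sketch of the corrected predicate; the helper name `is_content_title` is hypothetical, not part of the original code:

```python
string_list = ['Wikipedia', "维基", "英語", "英语", "Category", ":", "/"]

def is_content_title(title: str) -> bool:
    # True only when none of the stop words occurs in the title.
    return all(stop not in title for stop in string_list)

print(is_content_title("损失函数"))           # True: ordinary content title
print(is_content_title("Category:机器学习"))  # False: contains "Category" and ":"
```

Note that this stricter check also rejects `Category:` titles, so if the crawl is meant to descend into subcategories via `categorymembers`, those would need a separate allowance.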

Third, the retry-enabled `Session` built at the top is dead code: wikipediaapi talks to the API through its own internal session, so the mounted `HTTPAdapter` never affects a request, and the library exposes no public hook for injecting a session. Relatedly, `save_to_json` appends one JSON object per line, so `machine_learning.json` is a JSON Lines file rather than a single JSON document; a `.jsonl` extension would make that explicit.
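One workaround for the retries, assuming the installed version keeps its `requests.Session` in the private `_session` attribute (an implementation detail that may change between releases):

```python
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import wikipediaapi

wiki = wikipediaapi.Wikipedia(
    user_agent="MyProjectName ([email protected])",
    language="zh",
)

retry = Retry(total=3, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
# Assumption: wiki._session is the session wikipediaapi uses for API calls.
wiki._session.mount("https://", HTTPAdapter(max_retries=retry))
```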