检查下面的代码,指出问题并改正:
import re
import wikipediaapi
import json
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from opencc import OpenCC
# Converter: Traditional Chinese -> Simplified Chinese.
cc = OpenCC('t2s')
# HTTP session with automatic retries on transient 5xx errors.
# NOTE(review): this session appears to never be handed to wikipediaapi below,
# so the retry configuration is currently unused — verify whether the
# installed wikipedia-api version can accept a custom session.
session = requests.Session()
retry = Retry(total=3, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retry)
session.mount('https://', adapter)
# Chinese-language Wikipedia client.
wiki = wikipediaapi.Wikipedia(
language='zh',
user_agent="MyProjectName ([email protected])" # only language and user_agent are needed
)
# Titles containing any of these substrings are meta/noise pages to be skipped.
string_list = ['Wikipedia',"维基", "英語","英语","Category",":","/"]
# Removes LaTeX "{\displaystyle ...}" fragments plus newlines and spaces.
# Compiled once at module level instead of on every call.
_CLEAN_RE = re.compile(r"{\\displaystyle([^}]*)}|[\n ]")


def _recurse_titles(titles, depth, visited, short_label, long_label, header, upclass):
    """Recurse into each linked/related page title at depth - 1.

    :param titles: iterable of page titles to consider
    :param depth: current depth (children are visited at depth - 1)
    :param visited: shared set of already-visited titles
    :param short_label: tag used in the "stop recursing" log line
    :param long_label: tag used in the "descending" log line
    :param header: log line printed once before processing
    :param upclass: propagated to every collected record
    :return: list of collected content dicts from all sub-pages
    """
    collected = []
    print(header)
    ## Stop descending when the next recursion level would be depth 0.
    if depth - 1 != 0:
        for title in titles:
            # BUGFIX: was any(s not in title ...), which is True whenever the
            # title merely lacks ONE blacklist entry — i.e. for almost every
            # title — so the blacklist never filtered anything. all(...)
            # correctly skips titles containing ANY blacklisted substring.
            if all(string not in title for string in string_list):
                sub_page = wiki.page(title)
                if sub_page.exists():
                    print(f"{'#'* (depth - 1)}第{depth - 1}层子元素{long_label},启动: {sub_page.title}")
                    collected.extend(collect_content(sub_page, depth - 1, visited, upclass=upclass))
                else:
                    print(f"页面不存在: {title}")
    else:
        print(f"别递归{short_label}了,预测你到头了")
    return collected


def collect_content(page, depth=1, visited=None, upclass="机器学习"):
    """Recursively collect cleaned, simplified-Chinese text from a wiki page.

    Records the page's title, cleaned summary/text and short category names,
    then recurses (depth - 1) into its links, category members and backlinks
    whose titles contain none of the blacklisted substrings in ``string_list``.

    :param page: wikipediaapi page object to start from
    :param depth: remaining recursion depth; <= 0 stops immediately
    :param visited: set of already-visited titles shared across the recursion
    :param upclass: value stored under the "upclass" key of every record
                    (default kept for backward compatibility)
    :return: list of dicts with keys concept/summary/text/upclass/categories
    """
    if visited is None:
        visited = set()
    if depth <= 0 or page.title in visited:
        print("到头了")
        return []
    visited.add(page.title)
    print(f"加入 {page.title} 页面作为已访问")
    # Strip the 9-char "Category:" prefix; keep only short (<= 6 chars) names.
    categories = [cc.convert(category.title[9:]) for category in page.categories.values() if len(category.title[9:]) <= 6]
    print(categories)
    # Drop the summary from the full text so it is not stored twice.
    new_text = page.text.replace(page.summary, "")
    cleaned_text = _CLEAN_RE.sub("", new_text)
    content_list = [
        {"concept": cc.convert(page.title),
         "summary": cc.convert(_CLEAN_RE.sub("", page.summary)),
         "text": cc.convert(cleaned_text),
         "upclass": upclass,
         "categories": categories}
    ]
    print(f"{'#'* depth}第{depth}层: {page.title}")
    try:
        ## Follow outgoing links only below the root page (len(visited) > 1)
        ## and only when the page has a manageable number of links.
        if len(visited) > 1:
            if len(page.links.keys()) <= 100:
                content_list.extend(_recurse_titles(
                    page.links.keys(), depth, visited,
                    "link", "link", "---存在link_key", upclass))
    except KeyError as e:
        print(f"link_API 响应中缺少字段: {e}")
        # Swallow and continue with the other expansion sources.
    except Exception as e:
        print(f"link_发生其他错误: {e}")
    try:
        # Expand into category members.
        if len(page.categorymembers.keys()) <= 100:
            content_list.extend(_recurse_titles(
                page.categorymembers.keys(), depth, visited,
                "cate", "category", "---存在cate_title:", upclass))
    except KeyError as e:
        print(f"cate_API 响应中缺少字段: {e}")
        # Swallow and continue with the other expansion sources.
    except Exception as e:
        print(f"cate_发生其他错误: {e}")
    try:
        # Expand into backlinks (optional; consider removing — very noisy).
        if len(page.backlinks.keys()) <= 100:
            content_list.extend(_recurse_titles(
                page.backlinks.keys(), depth, visited,
                "back", "backlink", "---存在back_title:", upclass))
    except KeyError as e:
        print(f"back_API 响应中缺少字段: {e}")
        # Swallow and continue.
    except Exception as e:
        print(f"back_发生其他错误: {e}")
    return content_list
def save_to_json(data, output_file):
    """Append every item of *data* to *output_file* in JSON Lines format.

    Each item becomes exactly one line containing its JSON representation
    (non-ASCII characters are written as-is). The file is opened in append
    mode, so repeated calls accumulate records.

    :param data: iterable of JSON-serializable items (typically dicts)
    :param output_file: path of the target file
    """
    with open(output_file, "a", encoding="utf-8") as fh:
        fh.writelines(json.dumps(record, ensure_ascii=False) + "\n" for record in data)
root_page = wiki.page("Category:机器学习")
output_file = "machine_learning.json"
recursion_depth = 3
if root_page.exists():
print(f"收集 '{root_page.title}'...")
content_data = collect_content(root_page, depth=recursion_depth)
save_to_json(content_data, output_file)
print(f"文件:'{output_file}' ")
else:
print(f"'{root_page.title}' 不存在。")
代码的语言设置其实没有问题:wikipediaapi.Wikipedia(language='zh', ...) 在构造时已经指定了中文,
wiki.page() 并不接受 language 参数,原先建议的
root_page = wiki.page("Category:机器学习", language='zh')
会直接抛出 TypeError。
真正的缺陷在过滤条件:any(string not in link_title for string in string_list)
只要标题缺少黑名单中任意一个子串就为真,几乎对所有标题成立,导致黑名单完全失效;
应改为 all(string not in link_title for string in string_list),
即仅当标题不包含任何黑名单子串时才递归访问。