"""Crawler for the monthly article index published at mysql.taobao.org."""

from typing import List, Tuple
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

prefix = 'http://mysql.taobao.org'

# Seconds to wait for the server before giving up; without a timeout
# ``requests.get`` can block forever on a stalled connection.
_REQUEST_TIMEOUT = 30


def _collect_monthly_links(url: str, attrs: dict) -> List[Tuple[str, str]]:
    """Fetch *url* and return ``(link text, absolute url)`` for matching anchors.

    Only ``<a>`` tags that match *attrs* and whose ``href`` contains
    ``/monthly/`` are returned, in document order.

    Args:
        url: Page to download.
        attrs: Attribute filter handed to ``BeautifulSoup.find_all``.

    Returns:
        A list of ``(stripped link text, absolute url)`` tuples; empty when
        the page has no matching anchors.

    Raises:
        requests.RequestException: If the download fails or times out.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    resp = requests.get(url, timeout=_REQUEST_TIMEOUT)
    soup = BeautifulSoup(resp.content.decode('utf-8'), "html.parser")

    links = []
    for tag in soup.find_all('a', attrs=attrs):
        # Anchors without an href are skipped instead of raising KeyError.
        href = tag.get('href', '')
        if '/monthly/' not in href:
            continue
        # get_text() also covers anchors with nested markup, where
        # ``tag.string`` is None and str() would yield the text 'None'.
        # urljoin() matches ``prefix + href`` for root-relative links and
        # additionally copes with absolute or relative hrefs.
        links.append((tag.get_text(strip=True), urljoin(prefix, href)))
    return links


# Get article names and URLs: (article name, url)
def query_name_url(url: str) -> List[Tuple[str, str]]:
    """Return ``(article name, url)`` for the article links found on *url*.

    Article links are the ``<a target="_top">`` anchors whose ``href``
    contains ``/monthly/``.
    """
    return _collect_monthly_links(url, {'target': '_top'})


# Get all monthly report links: (monthly report name, url)
def query_monthly_url() -> List[Tuple[str, str]]:
    """Return ``(monthly report name, url)`` for every report on the index page.

    Report links are the ``<a class="main">`` anchors on
    ``<prefix>/monthly/`` whose ``href`` contains ``/monthly/``.
    """
    return _collect_monthly_links(prefix + '/monthly/', {'class': 'main'})


# Get all article names, URLs and the matching monthly report links:
# (article type, article name, url, monthly report name, url)
#
# NOTE(review): the reviewed source is cut off in the middle of the statement
# below. The visible fragment is preserved verbatim (commented out so the
# module stays importable); restore and complete it from the full file.
#
# def query_all_name_url():
#     result = []
#     monthly_urls = query_monthly_url()
#     for data1 in monthly_urls:
#         name_urls = query_name_url(data1[1])
#         for data2 in name_urls:
#             result.