Crawler

import requests
from bs4 import BeautifulSoup
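# Third-party dependencies: pip install requests beautifulsoup4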

prefix = 'http://mysql.taobao.org'  # site root; the pages use relative hrefs


# Fetch the articles of one monthly issue; returns (article name, url) tuples.
def query_name_url(url: str):
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content.decode('utf-8'), "html.parser")
    # Article links open in the top frame (<a target="_top">); keep only the
    # ones that point back into /monthly/.
    tags = soup.find_all('a', attrs={'target': '_top'})
    urls = [v for v in tags if '/monthly/' in v['href']]
    return [(str(v.string).strip(), prefix + v['href']) for v in urls]
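
# A sketch of the expected return shape, with hypothetical values (real titles
# and paths depend on the live site):
#   query_name_url('http://mysql.taobao.org/monthly/2017/01/')
#   -> [('MySQL · 源码分析 · ...', 'http://mysql.taobao.org/monthly/2017/01/01/'), ...]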


# Fetch all monthly issue links from the index page; returns (issue name, url) tuples.
def query_monthly_url():
    resp = requests.get('http://mysql.taobao.org/monthly/')
    soup = BeautifulSoup(resp.content.decode('utf-8'), "html.parser")
    # Issue links on the index page carry the CSS class "main".
    tags = soup.find_all('a', {'class': 'main'})
    urls = [v for v in tags if '/monthly/' in v['href']]
    return [(str(v.string).strip(), prefix + v['href']) for v in urls]
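
# Expected return shape, with hypothetical values:
#   query_monthly_url()
#   -> [('数据库内核月报 - 2017 / 01', 'http://mysql.taobao.org/monthly/2017/01/'), ...]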


# Collect every article across all issues; returns
# (category, article name, article url, issue name, issue url) tuples.
def query_all_name_url():
    result = []

    for issue_name, issue_url in query_monthly_url():
        for article_name, article_url in query_name_url(issue_url):
            # Titles look like "MySQL · 源码分析 · ...": the text before the
            # first '·' (the database name) serves as the category.
            category = article_name[:article_name.find('·')].strip()
            result.append((category, article_name, article_url, issue_name, issue_url))
    return result
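
# Each result tuple then looks like (hypothetical values):
#   ('MySQL', 'MySQL · 源码分析 · ...', 'http://mysql.taobao.org/monthly/2017/01/01/',
#    '数据库内核月报 - 2017 / 01', 'http://mysql.taobao.org/monthly/2017/01/')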


# Download the full article list, group it by category, and write it to mysql.md.
name = ''
data = []
result = query_all_name_url()
result.sort(key=lambda v: v[0])
for v in result:
    if v[0] != name:
        # Start a new section whenever the category changes.
        name = v[0]
        data.append('## {}\n'.format(v[0]))
    # Issue names carry a ' - ' before the date; keep only the date part.
    data.append('[{}]({}) [[{}]({})]\n'.format(v[1], v[2], v[3][v[3].find(' - ') + 3:], v[4]))

with open('mysql.md', 'w', encoding='utf-8') as file:
    for v in data:
        print(v)
        file.write(v + '\n')
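
# The generated mysql.md groups the links under one '## <category>' heading per
# database; a hypothetical excerpt:
#   ## MySQL
#   [MySQL · 源码分析 · ...](http://mysql.taobao.org/monthly/2017/01/01/) [[2017 / 01](http://mysql.taobao.org/monthly/2017/01/)]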