中国大学排名爬虫

爬取的目标网站是：http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html

from bs4 import BeautifulSoup
import bs4
import requests
import csv
import pandas as pd


def getHtmlText(url, timeout=10):
    """Download *url* and return the decoded page text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The response body as text, or the string ``'error'`` when the
        request fails (connection problem, timeout, or HTTP error status).
    """
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    try:
        response = requests.get(url, headers=head, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures map to the sentinel; programming
        # errors and KeyboardInterrupt are no longer silently swallowed.
        return 'error'
    # Use the encoding detected from the body; without this the text may
    # be decoded with the wrong charset.
    response.encoding = response.apparent_encoding
    return response.text


def fillUnivList(uList, html, fieldCount=14):
    """Parse the ranking table in *html* and append one row per school.

    Args:
        uList: List that receives the parsed rows; it is mutated in place.
        html: HTML text of the ranking page.
        fieldCount: Number of leading ``<td>`` cells to keep from each row.

    Returns:
        The same *uList* object. Each appended row is a list of
        *fieldCount* values taken from the ``.string`` of each cell.
        If the document has no ``<tbody>``, *uList* is returned unchanged.
    """
    soup = BeautifulSoup(html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        # No table body (e.g. the download failed and html is not the
        # expected page): nothing to parse, so avoid an AttributeError.
        return uList
    for tr in tbody.children:
        # Children also include text nodes (whitespace); keep only tags.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')
        if len(tds) < fieldCount:
            # Row with too few cells: skip it instead of raising
            # IndexError.
            continue
        uList.append([td.string for td in tds[:fieldCount]])
    return uList


def printUnivList(uList, num):
    """Print a count line, the column headers, and up to *num* rows.

    Args:
        uList: List of school rows.
        num: Maximum number of rows to print. When it exceeds
            ``len(uList)`` only the available rows are printed; a
            negative value prints no rows.
    """
    head = ['排名', '学校', '地区', '总分',
            '生源得分', '培养结果', '学生声誉', '科研规模',
            '科研质量', '顶尖成果', '顶尖人才', '科技服务',
            '成果转化', '学生国际化']
    # Slice instead of indexing so asking for more rows than exist does
    # not raise IndexError.
    shown = uList[:max(num, 0)]
    print(str(len(shown)) + "个学校的排名信息")
    print(head)
    for school in shown:
        print(school)


def saveData(data, path=r"c:\test\中国大学排名.csv"):
    """Write the ranking rows to a CSV file with a header line.

    Args:
        data: List of rows, each holding 14 values in header order.
        path: Destination file; defaults to the location the script has
            always written to.

    The file is encoded as ``utf-8-sig`` (UTF-8 with a BOM).
    """
    head = ['排名', '学校', '地区', '总分',
            '生源得分', '培养结果', '学生声誉', '科研规模',
            '科研质量', '顶尖成果', '顶尖人才', '科技服务',
            '成果转化', '学生国际化']
    # Build the frame from the *data* argument (not a module global) and
    # set the column names here so an empty list still yields a valid
    # header-only file.
    df = pd.DataFrame(data=data, columns=head)
    df.to_csv(path, encoding="utf-8-sig", mode="w", index=False)

    print('保存成功')


if __name__ == '__main__':
    # Script entry point: download the ranking page, parse it, print the
    # rows and save them to CSV.
    url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html'
    html = getHtmlText(url)
    if html == 'error':
        # getHtmlText signals a failed download with this sentinel; stop
        # here rather than trying to parse it as a page.
        raise SystemExit('页面下载失败: ' + url)
    uList = fillUnivList([], html)
    # Print every parsed row instead of assuming a fixed count of 549.
    printUnivList(uList, len(uList))
    saveData(uList)

 

点赞

发表评论

电子邮件地址不会被公开。必填项已用 * 标注

19 − 10 =