The website being scraped is http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html
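Before writing the full script, it helps to confirm the page encoding and the layout of the ranking table. The probe below is a minimal sketch of that check; it assumes only what the full script itself relies on, namely the same URL and a tbody element that wraps the ranking rows:

import requests
from bs4 import BeautifulSoup

url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html'
resp = requests.get(url, timeout=10)
resp.encoding = resp.apparent_encoding  # align the encoding with the page content, as the full script does
soup = BeautifulSoup(resp.text, 'html.parser')
first_row = soup.find('tbody').find('tr')  # first data row of the ranking table
print([td.get_text(strip=True) for td in first_row.find_all('td')])  # expect 14 columns, matching range(14) below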
from bs4 import BeautifulSoup
import bs4
import requests
import csv
import pandas as pd


def getHtmlText(url):
    try:
        head = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
        html = requests.get(url, headers=head)
        html.raise_for_status()
        html.encoding = html.apparent_encoding  # never forget this line: align the encoding with the page content
        return html.text
    except:
        return 'error'


def fillUnivList(uList, html):
    soup = BeautifulSoup(html, 'html.parser')
    for tr in soup.find('tbody').children:
        if isinstance(tr, bs4.element.Tag):  # keep only Tag nodes, skip the NavigableStrings between rows
            tds = tr('td')
            aSchool = []  # holds the columns of a single school
            for i in range(14):
                aSchool.append(tds[i].string)
            uList.append(aSchool)
    return uList


def printUnivList(uList, num):
    print(str(num) + "个学校的排名信息")
    head = ['排名', '学校', '地区', '总分', '生源得分', '培养结果', '学生声誉', '科研规模',
            '科研质量', '顶尖成果', '顶尖人才', '科技服务', '成果转化', '学生国际化']
    print(head)
    for i in range(num):
        print(uList[i])


def saveData(data):
    head = ['排名', '学校', '地区', '总分', '生源得分', '培养结果', '学生声誉', '科研规模',
            '科研质量', '顶尖成果', '顶尖人才', '科技服务', '成果转化', '学生国际化']
    # Option 1: save with the csv module
    # (encoding the file as utf-8-sig is just as important here)
    # with open(r'c:\test\daxue.csv', 'w', newline='', encoding='utf-8-sig') as f:
    #     writer = csv.writer(f)
    #     writer.writerow(head)
    #     for row in data:
    #         writer.writerow(row)
    # Option 2: save with pandas
    df = pd.DataFrame(data=data)  # use the function argument, not the global uList
    df.to_csv(r"c:\test\中国大学排名.csv", encoding="utf-8-sig", mode="w+",
              header=head, index=False)
    print('保存成功')


if __name__ == '__main__':
    uList = []
    url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html'
    html = getHtmlText(url)
    fillUnivList(uList, html)
    printUnivList(uList, 549)
    saveData(uList)
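Once the script has run, it is worth reading the file back to confirm that every row and column made it to disk. This is a minimal sketch of that check, assuming the script above has already written c:\test\中国大学排名.csv:

import pandas as pd

# read the saved ranking back; utf-8-sig handles the BOM written for Excel compatibility
df = pd.read_csv(r"c:\test\中国大学排名.csv", encoding="utf-8-sig")
print(df.shape)   # expect (549, 14) if every row was captured
print(df.head())  # first few schools, with the Chinese headers intact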