
Crawling Baidu Search Results

Fetch Baidu search results with a Python crawler.

import requests
from urllib.parse import quote
from bs4 import BeautifulSoup


class Crawler:
    def __init__(self, timeout=5, max_pages=1):
        self.timeout = timeout
        self.max_pages = max_pages
        # Headers that imitate an ordinary browser request.
        self.headers = {
            'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'User-Agent': 'Mozilla/6.1 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
        }

    def __call__(self, keywords):
        # Build the search URL; quote() percent-encodes the query keywords.
        url = 'https://www.baidu.com/baidu?wd=' + quote(keywords) + '&tn=monline_dg&ie=utf-8'
        crawled_pages = 0
        crawled_contents = []
        while True:
            html = self.crawl(url)
            if len(html) == 0:
                break
            soup = BeautifulSoup(html, 'html.parser')
            # Each result abstract sits in a div with class "c-abstract".
            results = soup.find_all("div", class_="c-abstract")
            for result in results:
                crawled_contents.append(result.get_text().strip().replace(">>", ""))
            crawled_pages += 1
            if crawled_pages >= self.max_pages:
                break
            # The "next page" link has class "n"; stop when it is missing.
            next_link = soup.find('a', class_="n")
            if not next_link:
                break
            url = 'https://www.baidu.com' + next_link['href']
        return keywords, crawled_contents

    def crawl(self, url):
        r = requests.get(url, timeout=self.timeout, headers=self.headers)
        if r.status_code == 200:
            return r.text
        print('[ERROR]', url, 'GET request for this URL did not return HTTP status 200')
        return ''


if __name__ == "__main__":
    crawler = Crawler()
    print(crawler("虎门销烟是谁干的"))
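The example above uses the default settings, which only fetch the first result page. A minimal usage sketch for crawling several pages with a longer timeout is shown below; it only relies on the timeout and max_pages constructor arguments already defined above, and the module filename crawler.py is assumed for illustration.

# Minimal usage sketch: crawl up to 3 result pages with a 10-second timeout.
# Assumes the Crawler class above is saved as crawler.py (hypothetical filename).
from crawler import Crawler

crawler = Crawler(timeout=10, max_pages=3)
keywords, contents = crawler("虎门销烟是谁干的")
for i, abstract in enumerate(contents, 1):
    print(i, abstract)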