Scraping Baidu Search Results

Posted on 2020-05-15 | Categories: python

Fetch Baidu search results with a Python crawler.

```python
import requests
from urllib.parse import quote
from bs4 import BeautifulSoup


class Crawler:
    def __init__(self, timeout=5, max_pages=1):
        self.timeout = timeout
        self.max_pages = max_pages
        self.headers = {
            'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'User-Agent': 'Mozilla/6.1 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
        }

    def __call__(self, keywords):
        url = 'https://www.baidu.com/baidu?wd=' + quote(keywords) + '&tn=monline_dg&ie=utf-8'
        crawled_pages = 0
        crawled_contents = []
        while True:
            html = self.crawl(url)
            if len(html) == 0:
                break
            soup = BeautifulSoup(html, 'html.parser')
            # Each result's abstract sits in a div with class "c-abstract".
            results = soup.find_all("div", class_="c-abstract")
            for result in results:
                crawled_contents.append(result.get_text().strip().replace(">>", ""))
            crawled_pages += 1
            if crawled_pages >= self.max_pages:
                break
            # The "next page" link carries class "n"; stop when it is absent.
            next_page = soup.find('a', class_="n")
            if not next_page:
                break
            url = 'https://www.baidu.com' + next_page['href']
        return keywords, crawled_contents

    def crawl(self, url):
        r = requests.get(url, timeout=self.timeout, headers=self.headers)
        if r.status_code == 200:
            return r.text
        print('[ERROR]', url, 'GET for this URL did not return HTTP 200')
        return ''


if __name__ == "__main__":
    crawler = Crawler()
    print(crawler("虎门销烟是谁干的"))
```
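The constructor's `max_pages` parameter caps how many result pages the loop follows via the "next page" link. A minimal usage sketch of that option (the query string and page count here are only illustrative):

```python
# Follow up to three result pages for a single query.
crawler = Crawler(timeout=10, max_pages=3)
keywords, abstracts = crawler("python requests tutorial")
for i, abstract in enumerate(abstracts, 1):
    print(i, abstract)
```

Note that Baidu may serve a verification page instead of results when it detects automated traffic; in that case no `c-abstract` divs are present and the returned list is simply empty.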