# Required third-party libraries: requests, lxml
import requests
import re
from lxml.html import etree
import os
import time

# Browser-like headers; the site rejects bare clients without a referer/UA.
HEADERS = {
    'referer': 'https://www.doutula.com/article/list/?page=3',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3783.0 Safari/537.36'
}

# Base directory; one sub-folder per meme album is created under it.
BASE_DIR = '/Users/pyd/Documents/BigData/gs2/doutu'

# Image-URL extraction regex, compiled ONCE at module load.
# (The original fused `continue` with this compile — `continuehj = re.compile(...)` —
# and recompiled it per album; both defects are fixed here.)
IMG_PATTERN = re.compile(
    '<img referrerpolicy="no-referrer" src="(.*?)" alt=".*?" onerror="this.*?>',
    re.S,
)


def fetch_albums(page):
    """Return a {title: url} dict of albums linked from one listing page.

    Network or parse failures are treated as best-effort: an empty dict
    is returned so the crawl can continue with the next page.
    """
    url = 'https://www.doutula.com/article/list/?page={}'.format(page)
    try:
        res = requests.get(url, headers=HEADERS, timeout=5)
    except requests.RequestException:
        return {}
    doc = etree.HTML(res.text)
    albums = {}
    for link in doc.xpath('//*[@id="home"]/div/div[2]//a'):
        href = link.xpath('@href')
        title = link.xpath('div[1]/text()')
        # Only absolute http(s) links are album pages; also guard against
        # links with no title div (the original indexed [0] unconditionally).
        if href and title and href[0].startswith('http'):
            albums[title[0]] = href[0]
    return albums


def save_image(url, folder):
    """Download one image into *folder*, named by the current timestamp.

    Only URLs ending in jpg/gif (case-insensitive — the original checked
    jpg case-sensitively but gif case-insensitively) are saved; failures
    are skipped silently, matching the original best-effort behavior.
    """
    ext = url[-3:].lower()
    if ext not in ('jpg', 'gif'):
        return
    try:
        resp = requests.get(url, headers=HEADERS, timeout=5)
    except requests.RequestException:
        return
    path = os.path.join(folder, '{}.{}'.format(time.time(), ext))
    with open(path, 'wb') as f:
        f.write(resp.content)


def download_album(title, album_url):
    """Create a folder for one album and download every image in it.

    An album whose folder already exists is assumed done and skipped,
    preserving the original resume-by-folder behavior.
    """
    folder = os.path.join(BASE_DIR, title)
    if os.path.exists(folder):
        return
    # makedirs also creates missing parents (os.mkdir would fail).
    os.makedirs(folder)
    try:
        page = requests.get(album_url, headers=HEADERS, timeout=5)
    except requests.RequestException:
        return
    for img_url in IMG_PATTERN.findall(page.text):
        save_image(img_url, folder)


def main():
    """Crawl listing pages 1..99 and download every new album found."""
    for page_no in range(1, 100):
        for title, album_url in fetch_albums(page_no).items():
            download_album(title, album_url)


if __name__ == '__main__':
    main()

# 庞玉栋个人博客、人生苦短-我用Python  (author signature: Pang Yudong's blog — "Life is short, I use Python")