庞玉栋

爬取斗图网表情包

发布时间:10个月前 | 热度: 821 ℃ | 评论数:
#需要安装的库:requests lxml
import requests
import re
from lxml.html import etree
import os
import time
# Crawl the doutula.com article list pages and save every sticker pack's
# jpg/gif images into one folder per pack.

# Request headers sent with every HTTP call so the crawler looks like a
# regular browser visit.
headers = {
    'referer': 'https://www.doutula.com/article/list/?page=3',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3783.0 Safari/537.36'
}

# Root directory; one sub-folder per sticker pack is created below it.
SAVE_DIR = '/Users/pyd/Documents/BigData/gs2/doutu'
# Number of list pages to crawl (pages 1..PAGE_COUNT inclusive).
PAGE_COUNT = 100
# Seconds to wait on any single HTTP request before giving up.
REQUEST_TIMEOUT = 5
# Image types that are downloaded; matched against the last three
# characters of the image URL, case-insensitively.
IMAGE_EXTENSIONS = ('jpg', 'gif')
# Captures the src URL of each sticker <img> tag on a pack's detail page.
# Compiled once here instead of once per pack.
IMG_PATTERN = re.compile(r'<img referrerpolicy="no-referrer" src="(.*?)" alt=".*?" onerror="this.*?>', re.S)

for page in range(1, PAGE_COUNT + 1):
    url = 'https://www.doutula.com/article/list/?page={}'.format(page)
    try:
        res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
    except requests.RequestException as e:
        # Best effort: report the failure and move on to the next page.
        print('skipping list page {}: {}'.format(url, e))
        continue

    csc = etree.HTML(res.text)
    if csc is None:
        # Nothing parseable came back for this page.
        continue

    # Map each sticker pack's title to the URL of its detail page.
    packs = {}
    for link in csc.xpath('//*[@id="home"]/div/div[2]//a'):
        hrefs = link.xpath('@href')
        titles = link.xpath('div[1]/text()')
        # Anchors without an href or a title would otherwise raise
        # IndexError and abort the whole crawl.
        if not hrefs or not titles:
            continue
        if hrefs[0].startswith('http'):
            packs[titles[0]] = hrefs[0]

    # Create one folder per pack and download its images into it.
    for title, pack_url in packs.items():
        # A '/' in the title would be interpreted as a nested directory.
        folder = os.path.join(SAVE_DIR, title.replace('/', '_'))
        if os.path.exists(folder):
            # An existing folder means the pack was already downloaded.
            continue

        try:
            cbv = requests.get(pack_url, headers=headers, timeout=REQUEST_TIMEOUT)
            cbv.raise_for_status()
        except requests.RequestException as e:
            print('skipping pack {}: {}'.format(pack_url, e))
            continue

        # The folder is created only after the detail page was fetched, so a
        # failed fetch does not leave an empty folder that would make later
        # runs skip this pack.
        try:
            os.makedirs(folder)
        except OSError as e:
            print('cannot create {}: {}'.format(folder, e))
            continue

        for img_url in IMG_PATTERN.findall(cbv.text):
            ext = img_url[-3:].lower()
            if ext not in IMAGE_EXTENSIONS:
                continue

            try:
                resc = requests.get(img_url, headers=headers, timeout=REQUEST_TIMEOUT)
                # Do not store an HTTP error page under an image file name.
                resc.raise_for_status()
            except requests.RequestException as e:
                print('skipping image {}: {}'.format(img_url, e))
                continue

            # The current timestamp serves as the file name.
            target = os.path.join(folder, '{}.{}'.format(str(time.time()), ext))
            try:
                with open(target, 'wb') as f:
                    f.write(resc.content)
            except OSError as e:
                print('cannot write {}: {}'.format(target, e))

error

手机扫码访问