庞玉栋

Liepin job-site crawler (proxy pool + email notification + writing to a Flume server)

# coding=utf-8
import requests
import re
import time
import json
from demo2 import soct
import pymysql
import random
import smtplib
import email.mime.multipart
import email.mime.text

import socket

class Spider():
    def __init__(self):
        self.ls = []        # pool of usable proxies loaded from MySQL
        self.count = 0      # number of records sent to the socket server
        self.USER_AGENTS = [
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        ]

    def emial(self):
        # Send a run log by email once all URLs have been crawled.
        try:
            subject = 'python执行日志'
            content = "可用{}个代理ip,所有url已经爬取完毕".format(len(self.ls))
            msg = email.mime.multipart.MIMEMultipart()
            msg['from'] = '******@qq.com'
            msg['to'] = '******@qq.com'
            msg['subject'] = subject
            txt = email.mime.text.MIMEText(content, 'plain', 'utf-8')
            msg.attach(txt)
            smtpHost = 'smtp.qq.com'
            smtpPort = '25'   # plain SMTP port (unused; SSL on 465 is used below)
            sslPort = '465'
            smtp = smtplib.SMTP_SSL(smtpHost, sslPort)
            smtp.ehlo()
            smtp.login('******@qq.com', '******key')
            smtp.sendmail('******@qq.com', '******@qq.com', str(msg))
            print("发送成功!")
            smtp.quit()
        except Exception as e:
            print(e)

    def soct(self, str_r):
        # Send one record to the socket server (the Flume listener) as a newline-terminated line.
        try:
            address = ('******', 5555)
            client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            client.connect(address)          # connect to the server (raises on failure)
            kk = str(str_r) + "\n"
            client.sendall(kk.encode())      # send the data
            client.settimeout(10)
            nn = "成功:"
            client.close()
            return nn
        except Exception as e:
            return "失败 " + str(e)
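The soct method only needs a TCP endpoint that accepts newline-terminated lines, which is what a Flume netcat-style source listens for. To try the crawler without a running Flume agent, a throwaway local listener is enough to see exactly what gets sent. This is a minimal sketch, not part of the original project; the 127.0.0.1:5555 address stands in for the masked server address above.

# Minimal stand-in for the Flume netcat source, for local testing only.
# Assumes the crawler's soct() is pointed at 127.0.0.1:5555.
import socket

def fake_flume_listener(host='127.0.0.1', port=5555):
    srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    srv.bind((host, port))
    srv.listen(5)
    print('listening on {}:{}'.format(host, port))
    while True:
        conn, addr = srv.accept()          # soct() opens one connection per record
        data = b''
        while not data.endswith(b'\n'):    # soct() terminates each record with "\n"
            chunk = conn.recv(4096)
            if not chunk:
                break
            data += chunk
        print('from {}: {}'.format(addr, data.decode('utf-8', errors='replace').rstrip()))
        conn.close()

if __name__ == '__main__':
    fake_flume_listener()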

    def sql(self):
        # Load usable HTTP proxies from the MySQL proxy pool, then start crawling.
        try:
            db = pymysql.connect(host="******", user="history", password="******",
                                 db="test", port=3306, charset="utf8")
            cur = db.cursor()
            sql = "SELECT * FROM proxy WHERE type=0"
            cur.execute(sql)             # run the query
            results = cur.fetchall()     # fetch every matching record
            for row in results:
                head = row[0]
                ip = row[1]
                poxy = row[2]
                if str(head) == 'HTTP':
                    self.ls.append(ip + ':' + poxy)
            print('可用{}个代理ip,爬虫开始'.format(len(self.ls)))
            self.crawler_data()
        except Exception as e:
            print(e)

    def crawler_data(self):
        # For each keyword, request 20 randomly chosen result pages, then email the run log.
        i = 0
        for n in ['架构师', '投资经理', '土建工程师', '采购', '销售', '运营', 'java', '产品经理',
                  '人力资源', '总经理', 'Python', 'iOS', 'PHP', 'Android', 'UI设计师', '测试工程师',
                  '自动化测试', '功能测试', '性能测试', '测试开发', '硬件测试', '运维工程师',
                  '系统工程师', '网络工程师', '运维开发', 'DBA', '技术经理', '架构师', '技术总监',
                  'CTO', '技术合伙人', '运维总监', '安全专家', '项目总监', 'Html5', '产品经理',
                  '网页产品经理', '移动产品经理', '产品助理', '数据产品经理', '游戏策划', '电商产品经理']:
            try:
                while True:
                    url = ('https://www.liepin.com/zhaopin/?industryType=&jobKind=&sortFlag=15&degradeFlag=0'
                           '&industries=&salary=&compscale=&key={}&clean_condition=&headckid=4a4adb68b22970bd'
                           '&d_pageSize=40&siTag=p_XzVCa5J0EfySMbVjghcw~fA9rXquZc5IkJpXC-Ycixw'
                           '&d_headId=62ac45351cdd7a103ac7d50e1142b2a0&d_ckId=62ac45351cdd7a103ac7d50e1142b2a0'
                           '&d_sfrom=search_fp&d_curPage=0&curPage={}').format(n, random.randint(1, 30))
                    self.request_job_list(url)
                    i = i + 1
                    if i == 20:
                        i = 0
                        break
            except Exception as e:
                print(e)
        self.emial()

    def request_job_list(self, url):
        # Fetch one search-result page through a random proxy and parse out every job entry.
        try:
            print(url)
            headers = {
                'Referer': 'https://www.liepin.com/',
                'User-Agent': random.choice(self.USER_AGENTS)
            }
            reponse = requests.get(url, proxies={'http': random.choice(self.ls)},
                                   headers=headers, timeout=5)
            ret = random.uniform(0, 2)
            print(reponse.text)
            # time.sleep(ret)
            # NOTE: the HTML tags inside this pattern were stripped when the post was
            # rendered; only these fragments survive. The full pattern captured, in order:
            # job title, detail-page href, "salary_region_degree_experience", company name
            # and industry from the job-list markup.
            pattern = re.compile(''
                                 '.*?.*?.*?'
                                 '.*?>(.*?).*?'
                                 '.*?target="_blank">(.*?).*?', re.S)
            datas = re.findall(pattern, reponse.text)
            for data in datas:
                title = data[0]
                href = data[1]
                result = data[2].split('_')
                salary = result[0]
                region = result[1]
                degree = result[2]
                experience = result[3]
                name = data[3]
                industry = data[4]
                href = '/' + href.split('/')[-2] + '/' + href.split('/')[-1]
                self.request_job_details(title, salary, region, degree, experience, name, industry, href)
            if reponse.status_code != 200:
                return 0
        except Exception as e:
            print('request_job_list error : {}'.format(e))

    def request_job_details(self, title, salary, region, degree, experience, name, industry, url):
        # Fetch the job detail page for one listing, again through a random proxy.
        try:
            url_1 = 'https://www.liepin.com' + url
            headers = {
                'User-Agent': random.choice(self.USER_AGENTS)
            }
            response = requests.get(url_1, proxies={'http': random.choice(self.ls)},
                                    headers=headers, timeout=5)
            ret = random.uniform(0, 2)
            # time.sleep(ret)
            self.parse_job_details(title, salary, region, degree, experience, name, industry, response.text)
        except Exception as e:
            print(e)

    def parse_job_details(self, title, salary, region, degree, experience, name, industry, text):
        # Extract company size, location and the job description, dump the record to a
        # local JSON file, and forward it to the socket server.
        try:
            tiems = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # NOTE: the list-item tags around this pattern were lost when the post was rendered.
            gsgm_url = re.findall('公司规模:(.*?)', text)
            gsdz_url = re.findall('"dqName": "(.*?)",', text)
            try:
                gsdz = gsdz_url[0]
            except Exception:
                gsdz = "无"
            try:
                gsgml = gsgm_url[0]
            except Exception:
                gsgml = "无"
            # NOTE: the tags wrapping the job-description block were also lost in rendering.
            pattern = re.compile('(.*?).*?', re.S)
            text = re.search(pattern, text)
            detail = re.sub(re.compile('<[^>]+>', re.S), '', text.group(1))
            dists = {'公司名称': name, '所属行业': industry, '公司规模': gsgml, '公司地址': region,
                     '工作地点': gsdz, '采集时间': tiems, '工资情况': salary, '学历要求': degree,
                     '工作经验': experience, '招聘岗位': title.strip('招聘'), '职位描述': detail}
            jsons = json.dumps(dists, ensure_ascii=False, indent=2)
            hh = str(title.strip('招聘')).split('/')
            bb = ''
            for n in hh:
                bb = n + bb
            with open('data/{}.json'.format(name.split('/')[0] + '-' + bb.split('/')[0]),
                      'w', encoding='utf-8') as f:
                f.write(jsons)
            li = [name, industry, gsgml, region, gsdz, tiems, salary, degree, experience,
                  title.strip('招聘'), detail]
            nn = self.soct(li)
            self.count += 1
            print('写入次数:{},发送到服务端状态:{}'.format(self.count, nn))
        except Exception as e:
            print("re parse_job_list error : ", str(e))


if __name__ == "__main__":
    mm = Spider()
    mm.sql()
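sql() only reads from the proxy pool; the proxy table it queries (a protocol column, ip, port, and a type flag where 0 appears to mean "usable") has to be filled and refreshed by a separate process. Below is a rough sketch of that feeding step under those assumptions; the column names are inferred from the variable names in sql(), and the connection details and liveness-check URL are placeholders, not part of the original project.

# Rough sketch of feeding the proxy pool that sql() reads from.
# Column order (head, ip, port) and the meaning of type=0 are inferred from the
# SELECT in sql(); host/credentials and the check URL are placeholders.
import requests
import pymysql

def check_and_store(ip, port):
    proxy = {'http': '{}:{}'.format(ip, port)}
    try:
        # Any stable page works as a liveness check; httpbin is just an example.
        ok = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=5).status_code == 200
    except Exception:
        ok = False
    db = pymysql.connect(host="127.0.0.1", user="history", password="******",
                         db="test", port=3306, charset="utf8")
    try:
        with db.cursor() as cur:
            cur.execute("INSERT INTO proxy (head, ip, port, type) VALUES (%s, %s, %s, %s)",
                        ('HTTP', ip, port, 0 if ok else 1))
        db.commit()
    finally:
        db.close()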

