# -*- coding: utf-8 -*-
import requests
import re
import time
import json
import pymysql
import random
import smtplib
import email.mime.multipart
import email.mime.text


class Spider():
    def __init__(self):
        self.ls = []        # pool of usable proxies, stored as "ip:port" strings
        self.count = 0      # number of job postings saved so far
        self.USER_AGENTS = [
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        ]

    # Send a notification email once the whole crawl has finished.
    def send_email(self):
        try:
            subject = 'python执行日志'
            content = "可用{}个代理ip,所有url已经爬取完毕".format(len(self.ls))
            msg = email.mime.multipart.MIMEMultipart()
            msg['From'] = 'xxxxx@qq.com'
            msg['To'] = 'xxxx@qq.com'
            msg['Subject'] = subject
            txt = email.mime.text.MIMEText(content, 'plain', 'utf-8')
            msg.attach(txt)
            smtpHost = 'smtp.qq.com'
            sslPort = 465  # SMTP over SSL
            smtp = smtplib.SMTP_SSL(smtpHost, sslPort)
            smtp.ehlo()
            smtp.login('xxxxx@qq.com', 'xxxxx')
            smtp.sendmail('xxxx@qq.com', 'xxxxx@qq.com', msg.as_string())
            print("发送成功!")
            smtp.quit()
        except Exception as e:
            print(e)

    # Load usable proxies from the database, then start the crawl.
    def sql(self):
        try:
            db = pymysql.connect(host="xxxxxxxxxx", user="xxxxxx", password="xxxxxx",
                                 db="xxxxx", port=3306, charset="utf8")
            cur = db.cursor()
            sql = "SELECT * FROM proxy WHERE type=0"
            cur.execute(sql)              # run the query
            results = cur.fetchall()      # fetch every matching row
            for row in results:
                head = row[0]
                ip = row[1]
                port = row[2]
                if str(head) == 'HTTP':
                    self.ls.append(ip + ':' + port)
            print('可用{}个代理ip,爬虫开始'.format(len(self.ls)))
            self.crawler_data()
        except Exception as e:
            print(e)

    # Crawl entry point: build a search URL for every keyword.
    def crawler_data(self):
        i = 0
        for n in ['投资经理', '土建工程师', '采购', '销售', '运营', 'java', '产品经理', '人力资源', '总经理',
                  'Python', 'iOS', 'PHP', 'Android', 'UI设计师', '测试工程师', '自动化测试', '功能测试',
                  '性能测试', '测试开发', '硬件测试', '运维工程师', '系统工程师', '网络工程师', '运维开发',
                  'DBA', '技术经理', '架构师', '技术总监', 'CTO', '技术合伙人', '运维总监', '安全专家',
                  '项目总监', 'Html5', '产品经理', '网页产品经理', '移动产品经理', '产品助理', '数据产品经理',
                  '游戏策划', '电商产品经理']:
            try:
                # Fetch ten randomly chosen result pages for each keyword.
                while True:
                    url = ('https://www.liepin.com/zhaopin/?industryType=&jobKind=&sortFlag=15'
                           '&degradeFlag=0&industries=&salary=&compscale=&key={}'
                           '&clean_condition=&headckid=4a4adb68b22970bd&d_pageSize=40'
                           '&siTag=p_XzVCa5J0EfySMbVjghcw~fA9rXquZc5IkJpXC-Ycixw'
                           '&d_headId=62ac45351cdd7a103ac7d50e1142b2a0'
                           '&d_ckId=62ac45351cdd7a103ac7d50e1142b2a0'
                           '&d_sfrom=search_fp&d_curPage=0&curPage={}').format(
                        n, random.randint(1, 20))
                    self.request_job_list(url)
                    print(url)
                    i = i + 1
                    if i == 10:
                        i = 0
                        break
            except Exception as e:
                print(e)
        self.send_email()

    # Request one search-results page, parse its job cards and pass each one on.
    def request_job_list(self, url):
        try:
            print(url)
            headers = {
                'Referer': 'https://www.liepin.com/',
                'User-Agent': random.choice(self.USER_AGENTS)
            }
            proxy = random.choice(self.ls)
            # The target URL is https, so the proxy is registered for both
            # schemes; otherwise requests would ignore it for https requests.
            response = requests.get(url, proxies={'http': proxy, 'https': proxy},
                                    headers=headers, timeout=5)
            ret = random.uniform(0, 2)
            time.sleep(ret)  # random pause between requests
            # NOTE: the regular expression in the original post was garbled when
            # its HTML tags were stripped during publishing.  The pattern below
            # is a best-guess reconstruction of Liepin's job-card markup at the
            # time; verify it against the live page before relying on it.
            # Capture groups, in order: job title, detail-page link, the
            # "salary_region_degree_experience" string, company name, industry.
            pattern = re.compile(
                '<div class="job-info">.*?<h3 title="(.*?)">.*?href="(.*?)".*?'
                '<p class="condition clearfix".*?title="(.*?)".*?'
                '<p class="company-name">.*?target="_blank">(.*?)</a>.*?'
                '<p class="field-financing">.*?target="_blank">(.*?)</a>',
                re.S)
            datas = re.findall(pattern, response.text)
            for data in datas:
                title = data[0]
                href = data[1]
                result = data[2].split('_')
                salary = result[0]
                region = result[1]
                degree = result[2]
                experience = result[3]
                name = data[3]
                industry = data[4]
                # Keep only the last two path segments, e.g. "/job/123456.shtml".
                href = '/' + href.split('/')[-2] + '/' + href.split('/')[-1]
                self.request_job_details(title, salary, region, degree, experience,
                                         name, industry, href)
            if response.status_code != 200:
                return 0
        except Exception as e:
            print('request_job_list error : {}'.format(e))

    # Request one job-detail page and hand its HTML on for parsing.
    def request_job_details(self, title, salary, region, degree, experience, name, industry, url):
        try:
            url_1 = 'https://www.liepin.com' + url
            headers = {
                'User-Agent': random.choice(self.USER_AGENTS)
            }
            proxy = random.choice(self.ls)
            response = requests.get(url_1, proxies={'http': proxy, 'https': proxy},
                                    headers=headers, timeout=5)
            ret = random.uniform(0, 2)
            time.sleep(ret)
            self.parse_job_details(title, salary, region, degree, experience,
                                   name, industry, response.text)
        except Exception as e:
            print(e)

    # Parse the detail page, build a dict and write it to a local JSON file.
    def parse_job_details(self, title, salary, region, degree, experience, name, industry, text):
        try:
            times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # NOTE: like the list-page pattern above, the company-scale and
            # job-description expressions lost their HTML tags in the original
            # post; the versions below are best-guess reconstructions.
            gsgm_url = re.findall('公司规模:(.*?)<', text)
            gsdz_url = re.findall('"dqName": "(.*?)",', text)
            try:
                gsdz = gsdz_url[0]
            except Exception:
                gsdz = "无"
            try:
                gsgml = gsgm_url[0]
            except Exception:
                gsgml = "无"
            pattern = re.compile('<div class="content content-word">(.*?)</div>.*?', re.S)
            text = re.search(pattern, text)
            # Strip any remaining HTML tags from the job description.
            detail = re.sub(re.compile('<[^>]+>', re.S), '', text.group(1))
            dists = {'公司名称': name, '所属行业': industry, '公司规模': gsgml, '公司地址': region,
                     '工作地点': gsdz, '采集时间': times, '工资情况': salary, '学历要求': degree,
                     '工作经验': experience, '招聘岗位': title.strip('招聘'), '职位描述': detail}
            jsons = json.dumps(dists, ensure_ascii=False, indent=2)
            # Build a file name from the job title, dropping any '/' characters
            # (the data/ directory must already exist).
            hh = str(title.strip('招聘')).split('/')
            bb = ''
            for n in hh:
                bb = n + bb
            with open('data/{}.json'.format(name.split('/')[0] + '-' + bb.split('/')[0]),
                      'w', encoding='utf-8') as f:
                f.write(jsons)
            li = [name, industry, gsgml, region, gsdz, times, salary, degree,
                  experience, title.strip('招聘'), detail]
            self.count += 1
        except Exception as e:
            print("re parse_job_list error : ", str(e))


# Execution entry point.
if __name__ == "__main__":
    mm = Spider()
    mm.sql()
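
Note: sql() expects a MySQL table named proxy whose first three columns hold each proxy's protocol, IP and port, plus a type flag used by the WHERE type=0 filter. The post never shows that schema, so the snippet below is only a minimal sketch of a layout that would satisfy those reads; the column names head, ip and port are assumptions.

# Minimal sketch only: the real table layout is not given in the post.  The
# column names (head, ip, port, type) are assumptions that match how
# Spider.sql() reads row[0], row[1] and row[2] and filters on type=0.
import pymysql

def create_proxy_table():
    db = pymysql.connect(host="xxxxxxxxxx", user="xxxxxx", password="xxxxxx",
                         db="xxxxx", port=3306, charset="utf8")
    try:
        with db.cursor() as cur:
            cur.execute(
                "CREATE TABLE IF NOT EXISTS proxy ("
                "  head VARCHAR(16),"   # protocol, e.g. 'HTTP'
                "  ip   VARCHAR(64),"   # proxy address
                "  port VARCHAR(8),"    # kept as text so ip + ':' + port concatenates
                "  type TINYINT"        # 0 = currently usable
                ")")
        db.commit()
    finally:
        db.close()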

庞玉栋's personal blog - Life is short, I use Python