
智联招聘 crawler + email notification

# -*- coding: utf-8 -*-
import requests
import re
import time
import json
import pymysql
import random
import smtplib
import email.mime.multipart
import email.mime.text
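
# Overall flow of the spider:
#   sql()                 -> load usable HTTP proxies from a MySQL `proxy` table
#   crawler_data()        -> build liepin.com search URLs for each keyword
#   request_job_list()    -> fetch a list page and extract every position on it
#   request_job_details() -> fetch the detail page of a single position
#   parse_job_details()   -> parse the detail page and write one JSON file per position
#   send_email()          -> mail a short report once all URLs have been crawled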


class Spider():

    def __init__(self):
        self.ls = []
        self.count = 0
        self.USER_AGENTS = [
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        ]

    # Send a notification email after the whole crawl has finished
    def send_email(self):
        try:
            subject = 'python执行日志'
            content = "可用{}个代理ip,所有url已经爬取完毕".format(len(self.ls))
            msg = email.mime.multipart.MIMEMultipart()
            msg['from'] = 'xxxxx@qq.com'
            msg['to'] = 'xxxx@qq.com'
            msg['subject'] = subject
            txt = email.mime.text.MIMEText(content, 'plain', 'utf-8')
            msg.attach(txt)
            smtpHost = 'smtp.qq.com'
            sslPort = 465  # SMTP over SSL

            smtp = smtplib.SMTP_SSL(smtpHost, sslPort)
            smtp.ehlo()
            smtp.login('xxxxx@qq.com', 'xxxxx')
            smtp.sendmail(msg['from'], msg['to'], msg.as_string())
            print("发送成功!")
            smtp.quit()
        except Exception as e:
            print(e)

    # Load usable proxies from the database, then start the crawl
    def sql(self):
        try:
            db = pymysql.connect(host="xxxxxxxxxx", user="xxxxxx", password="xxxxxx", db="xxxxx", port=3306,
                                 charset="utf8")
            cur = db.cursor()
            sql = "SELECT * FROM proxy WHERE type=0"
            cur.execute(sql)  # run the query
            results = cur.fetchall()  # fetch every matching record
            for row in results:
                head = row[0]  # protocol type
                ip = row[1]
                port = row[2]
                if str(head) == 'HTTP':
                    self.ls.append(ip + ':' + port)
            print('可用{}个代理ip,爬虫开始'.format(len(self.ls)))
            self.crawler_data()
        except Exception as e:
            print(e)

    # Crawl entry point: builds the search URLs for each keyword
    def crawler_data(self):
        i = 0
        for n in ['投资经理', '土建工程师', '采购', '销售', '运营', 'java', '产品经理', '人力资源', '总经理', 'Python', 'iOS', 'PHP', 'Android',
                  'UI设计师', '测试工程师', '自动化测试', '功能测试', '性能测试', '测试开发', '硬件测试', '运维工程师', '系统工程师', '网络工程师', '运维开发', 'DBA',
                  '技术经理', '架构师', '技术总监', 'CTO', '技术合伙人', '运维总监', '安全专家', '项目总监',
                  'Html5', '产品经理', '网页产品经理', '移动产品经理', '产品助理', '数据产品经理', '游戏策划', '电商产品经理']:
            try:
                while True:  # fetch 10 random result pages for this keyword
                    url = 'https://www.liepin.com/zhaopin/?industryType=&jobKind=&sortFlag=15&degradeFlag=0&industries=&salary=&compscale=&key={}&clean_condition=&headckid=4a4adb68b22970bd&d_pageSize=40&siTag=p_XzVCa5J0EfySMbVjghcw~fA9rXquZc5IkJpXC-Ycixw&d_headId=62ac45351cdd7a103ac7d50e1142b2a0&d_ckId=62ac45351cdd7a103ac7d50e1142b2a0&d_sfrom=search_fp&d_curPage=0&curPage={}'.format(
                        n, random.randint(1, 20))
                    self.request_job_list(url)
                    print(url)
                    i = i + 1
                    if i == 10:
                        i = 0
                        break
            except Exception as e:
                print(e)
        self.send_email()

    # Fetch a list page, extract every position on it and pass each one on
    def request_job_list(self, url):
        try:
            print(url)
            headers = {
                'Referer': 'https://www.liepin.com/',
                'User-Agent': random.choice(self.USER_AGENTS)
            }
            response = requests.get(url, proxies={'http': random.choice(self.ls)}, headers=headers, timeout=5)
            ret = random.uniform(0, 2)
            time.sleep(ret)
            if response.status_code != 200:
                return 0
            # One match per position on the list page; the groups capture, in order:
            # job title, detail-page link, the "salary_region_degree_experience"
            # condition string, company name and industry (class names follow the
            # liepin list-page markup of the time)
            pattern = re.compile(
                '<div class="job-info">.*?<h3 title="(.*?)">'
                '.*?<a href="(.*?)"'
                '.*?class="condition clearfix" title="(.*?)"'
                '.*?class="company-name">.*?target="_blank">(.*?)</a>'
                '.*?class="field-financing">.*?<span>(.*?)</span>', re.S)
            datas = re.findall(pattern, response.text)
            for data in datas:
                title = data[0]
                href = data[1]
                result = data[2].split('_')  # salary_region_degree_experience
                salary = result[0]
                region = result[1]
                degree = result[2]
                experience = result[3]
                name = data[3]
                industry = data[4]
                # keep only the last two path segments of the detail-page link
                href = '/' + href.split('/')[-2] + '/' + href.split('/')[-1]
                self.request_job_details(title, salary, region, degree, experience, name, industry, href)
        except Exception as e:
            print('request_job_list error : {}'.format(e))

    # Fetch the detail page of a single position and hand the HTML to the parser
    def request_job_details(self, title, salary, region, degree, experience, name, industry, url):
        try:
            url_1 = 'https://www.liepin.com' + url
            headers = {
                'User-Agent': random.choice(self.USER_AGENTS)
            }
            response = requests.get(url_1, proxies={'http': random.choice(self.ls)}, headers=headers, timeout=5)
            ret = random.uniform(0, 2)
            time.sleep(ret)
            self.parse_job_details(title, salary, region, degree, experience, name, industry, response.text)
        except Exception as e:
            print(e)

    # Parse the detail page, build a dict and write it to a local JSON file
    def parse_job_details(self, title, salary, region, degree, experience, name, industry, text):
        try:
            tiems = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            gsgm_url = re.findall('<li>公司规模:(.*?)</li>', text)  # company size
            gsdz_url = re.findall('"dqName": "(.*?)",', text)  # company location
            try:
                gsdz = gsdz_url[0]
            except Exception:
                gsdz = "无"
            try:
                gsgml = gsgm_url[0]
            except Exception:
                gsgml = "无"
            # job-description block; group 1 still contains HTML, stripped right below
            pattern = re.compile('<div class="content content-word">(.*?)</div>.*?</div>', re.S)
            text = re.search(pattern, text)
            detail = re.sub(re.compile('<[^>]+>', re.S), '', text.group(1))
            dists = {'公司名称': name, '所属行业': industry, '公司规模': gsgml, '公司地址': region,
                     '工作地点': gsdz, '采集时间': tiems, '工资情况': salary, '学历要求': degree,
                     '工作经验': experience, '招聘岗位': title.strip('招聘'), '职位描述': detail}
            jsons = json.dumps(dists, ensure_ascii=False, indent=2)
            # build the file name from the job title (reverse the '/'-separated parts)
            hh = str(title.strip('招聘')).split('/')
            bb = ''
            for n in hh:
                bb = n + bb
            with open('data/{}.json'.format(name.split('/')[0] + '-' + bb.split('/')[0]), 'w',
                      encoding='utf-8') as f:
                f.write(jsons)
            # flat list of the same fields; not used further
            li = [name, industry, gsgml, region, gsdz, tiems, salary, degree, experience,
                  title.strip('招聘'), detail]
            self.count += 1
        except Exception as e:
            print("re parse_job_list error : ", str(e))


# Program entry point
if __name__ == "__main__":
    mm = Spider()
    mm.sql()
