# -*- coding: utf-8 -*-
import requests
import re
import time
import json
import pymysql
import random
import smtplib
import email.mime.multipart
import email.mime.text


class Spider():
    def __init__(self):
        self.ls = []        # pool of usable proxies, stored as "ip:port" strings
        self.count = 0      # number of job postings saved so far
        self.USER_AGENTS = [
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        ]

    # Send a notification email once the whole crawl has finished.
    def send_email(self):
        try:
            subject = 'python执行日志'
            content = "可用{}个代理ip,所有url已经爬取完毕".format(len(self.ls))
            msg = email.mime.multipart.MIMEMultipart()
            msg['From'] = 'xxxxx@qq.com'
            msg['To'] = 'xxxx@qq.com'
            msg['Subject'] = subject
            txt = email.mime.text.MIMEText(content, 'plain', 'utf-8')
            msg.attach(txt)
            smtpHost = 'smtp.qq.com'
            sslPort = 465  # SMTP over SSL
            smtp = smtplib.SMTP_SSL(smtpHost, sslPort)
            smtp.ehlo()
            smtp.login('xxxxx@qq.com', 'xxxxx')
            smtp.sendmail('xxxx@qq.com', 'xxxxx@qq.com', msg.as_string())
            print("发送成功!")
            smtp.quit()
        except Exception as e:
            print(e)

    # Load usable proxies from the database, then start the crawl.
    def sql(self):
        try:
            db = pymysql.connect(host="xxxxxxxxxx", user="xxxxxx", password="xxxxxx",
                                 db="xxxxx", port=3306, charset="utf8")
            cur = db.cursor()
            sql = "SELECT * FROM proxy WHERE type=0"
            cur.execute(sql)              # run the query
            results = cur.fetchall()      # fetch every matching row
            for row in results:
                head = row[0]
                ip = row[1]
                port = row[2]
                if str(head) == 'HTTP':
                    self.ls.append(ip + ':' + port)
            print('可用{}个代理ip,爬虫开始'.format(len(self.ls)))
            self.crawler_data()
        except Exception as e:
            print(e)

    # Crawl entry point: build a search URL for every keyword.
    def crawler_data(self):
        i = 0
        for n in ['投资经理', '土建工程师', '采购', '销售', '运营', 'java', '产品经理', '人力资源', '总经理',
                  'Python', 'iOS', 'PHP', 'Android', 'UI设计师', '测试工程师', '自动化测试', '功能测试',
                  '性能测试', '测试开发', '硬件测试', '运维工程师', '系统工程师', '网络工程师', '运维开发',
                  'DBA', '技术经理', '架构师', '技术总监', 'CTO', '技术合伙人', '运维总监', '安全专家',
                  '项目总监', 'Html5', '产品经理', '网页产品经理', '移动产品经理', '产品助理', '数据产品经理',
                  '游戏策划', '电商产品经理']:
            try:
                # Fetch ten randomly chosen result pages for each keyword.
                while True:
                    url = ('https://www.liepin.com/zhaopin/?industryType=&jobKind=&sortFlag=15'
                           '&degradeFlag=0&industries=&salary=&compscale=&key={}'
                           '&clean_condition=&headckid=4a4adb68b22970bd&d_pageSize=40'
                           '&siTag=p_XzVCa5J0EfySMbVjghcw~fA9rXquZc5IkJpXC-Ycixw'
                           '&d_headId=62ac45351cdd7a103ac7d50e1142b2a0'
                           '&d_ckId=62ac45351cdd7a103ac7d50e1142b2a0'
                           '&d_sfrom=search_fp&d_curPage=0&curPage={}').format(
                        n, random.randint(1, 20))
                    self.request_job_list(url)
                    print(url)
                    i = i + 1
                    if i == 10:
                        i = 0
                        break
            except Exception as e:
                print(e)
        self.send_email()

    # Request one search-results page, parse its job cards and pass each one on.
    def request_job_list(self, url):
        try:
            print(url)
            headers = {
                'Referer': 'https://www.liepin.com/',
                'User-Agent': random.choice(self.USER_AGENTS)
            }
            proxy = random.choice(self.ls)
            # The target URL is https, so the proxy is registered for both
            # schemes; otherwise requests would ignore it for https requests.
            response = requests.get(url, proxies={'http': proxy, 'https': proxy},
                                    headers=headers, timeout=5)
            ret = random.uniform(0, 2)
            time.sleep(ret)  # random pause between requests
            # NOTE: the regular expression in the original post was garbled when
            # its HTML tags were stripped during publishing.  The pattern below
            # is a best-guess reconstruction of Liepin's job-card markup at the
            # time; verify it against the live page before relying on it.
            # Capture groups, in order: job title, detail-page link, the
            # "salary_region_degree_experience" string, company name, industry.
            pattern = re.compile(
                '<div class="job-info">.*?<h3 title="(.*?)">.*?href="(.*?)".*?'
                '<p class="condition clearfix".*?title="(.*?)".*?'
                '<p class="company-name">.*?target="_blank">(.*?)</a>.*?'
                '<p class="field-financing">.*?target="_blank">(.*?)</a>',
                re.S)
            datas = re.findall(pattern, response.text)
            for data in datas:
                title = data[0]
                href = data[1]
                result = data[2].split('_')
                salary = result[0]
                region = result[1]
                degree = result[2]
                experience = result[3]
                name = data[3]
                industry = data[4]
                # Keep only the last two path segments, e.g. "/job/123456.shtml".
                href = '/' + href.split('/')[-2] + '/' + href.split('/')[-1]
                self.request_job_details(title, salary, region, degree, experience,
                                         name, industry, href)
            if response.status_code != 200:
                return 0
        except Exception as e:
            print('request_job_list error : {}'.format(e))

    # Request one job-detail page and hand its HTML on for parsing.
    def request_job_details(self, title, salary, region, degree, experience, name, industry, url):
        try:
            url_1 = 'https://www.liepin.com' + url
            headers = {
                'User-Agent': random.choice(self.USER_AGENTS)
            }
            proxy = random.choice(self.ls)
            response = requests.get(url_1, proxies={'http': proxy, 'https': proxy},
                                    headers=headers, timeout=5)
            ret = random.uniform(0, 2)
            time.sleep(ret)
            self.parse_job_details(title, salary, region, degree, experience,
                                   name, industry, response.text)
        except Exception as e:
            print(e)

    # Parse the detail page, build a dict and write it to a local JSON file.
    def parse_job_details(self, title, salary, region, degree, experience, name, industry, text):
        try:
            times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # NOTE: like the list-page pattern above, the company-scale and
            # job-description expressions lost their HTML tags in the original
            # post; the versions below are best-guess reconstructions.
            gsgm_url = re.findall('公司规模:(.*?)<', text)
            gsdz_url = re.findall('"dqName": "(.*?)",', text)
            try:
                gsdz = gsdz_url[0]
            except Exception:
                gsdz = "无"
            try:
                gsgml = gsgm_url[0]
            except Exception:
                gsgml = "无"
            pattern = re.compile('<div class="content content-word">(.*?)</div>.*?', re.S)
            text = re.search(pattern, text)
            # Strip any remaining HTML tags from the job description.
            detail = re.sub(re.compile('<[^>]+>', re.S), '', text.group(1))
            dists = {'公司名称': name, '所属行业': industry, '公司规模': gsgml, '公司地址': region,
                     '工作地点': gsdz, '采集时间': times, '工资情况': salary, '学历要求': degree,
                     '工作经验': experience, '招聘岗位': title.strip('招聘'), '职位描述': detail}
            jsons = json.dumps(dists, ensure_ascii=False, indent=2)
            # Build a file name from the job title, dropping any '/' characters
            # (the data/ directory must already exist).
            hh = str(title.strip('招聘')).split('/')
            bb = ''
            for n in hh:
                bb = n + bb
            with open('data/{}.json'.format(name.split('/')[0] + '-' + bb.split('/')[0]),
                      'w', encoding='utf-8') as f:
                f.write(jsons)
            li = [name, industry, gsgml, region, gsdz, times, salary, degree,
                  experience, title.strip('招聘'), detail]
            self.count += 1
        except Exception as e:
            print("re parse_job_list error : ", str(e))


# Execution entry point.
if __name__ == "__main__":
    mm = Spider()
    mm.sql()
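
Note: sql() expects a MySQL table named proxy whose first three columns hold each proxy's protocol, IP and port, plus a type flag used by the WHERE type=0 filter. The post never shows that schema, so the snippet below is only a minimal sketch of a layout that would satisfy those reads; the column names head, ip and port are assumptions.

# Minimal sketch only: the real table layout is not given in the post.  The
# column names (head, ip, port, type) are assumptions that match how
# Spider.sql() reads row[0], row[1] and row[2] and filters on type=0.
import pymysql

def create_proxy_table():
    db = pymysql.connect(host="xxxxxxxxxx", user="xxxxxx", password="xxxxxx",
                         db="xxxxx", port=3306, charset="utf8")
    try:
        with db.cursor() as cur:
            cur.execute(
                "CREATE TABLE IF NOT EXISTS proxy ("
                "  head VARCHAR(16),"   # protocol, e.g. 'HTTP'
                "  ip   VARCHAR(64),"   # proxy address
                "  port VARCHAR(8),"    # kept as text so ip + ':' + port concatenates
                "  type TINYINT"        # 0 = currently usable
                ")")
        db.commit()
    finally:
        db.close()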

庞玉栋's personal blog - Life is short, I use Python