Django 用户认证 用户 邮箱登录 邮箱注册 ORM or,and,not form.py FORM ModelForm Paginator 分页 HTML JQuery 定位元素 ajax django切片 restfulapi 跨域 Ubuntu Python Mysql Scrapy 爬虫 导出 Python读写 Pycharm 破解 session re sqlite3 生成式 其他 Prism 富文本 CSS Nginx 部署 请求头 抓包 协议 selenium Ubuntu 宝塔 AI Comfy-ui ollama dify open-webui Git docker
51jobSpider
张建行 2022年6月20日 10:26 90 文章标签: Scrapy 爬虫 导出 re 请求头 抓包

51jobSpider

from fake_useragent import UserAgent
from urllib import parse
import requests, re, json, csv


class Job51Spider(object):
    """Spider for job listings on search.51job.com.

    The constructor truncates ``51job.csv`` and writes a header row;
    :meth:`get_page_info` fetches result pages and appends one CSV row
    per listing.
    """

    CSV_PATH = '51job.csv'
    # Column order must match the key order used in get_page_info().
    CSV_HEADER = ['职位名称', '公司名称', '薪资', '工作地点', '职位发布日期',
                  '公司类型', '公司福利', '职位技能', '公司业务', '公司规模']
    # The listing page embeds its results as a JSON blob assigned to
    # window.__SEARCH_RESULT__.  Compile once (not per request) and escape
    # the dots so they match literally.
    RESULT_RE = re.compile(r'window\.__SEARCH_RESULT__ = (.*?)</script>', re.S)
    URL_TMPL = ('https://search.51job.com/list/020000,000000,0000,00,9,99,'
                '{},2,{}.html?lang=c&postchannel=0000&workyear=99&cotype=99'
                '&degreefrom=99&jobterm=99&companysize=99&ord_field=0'
                '&dibiaoid=0&line=&welfare=')

    def __init__(self):
        # Truncate any previous run's output and write the header row.
        with open(self.CSV_PATH, 'w', encoding='utf-8', newline='') as f:
            csv.writer(f).writerow(self.CSV_HEADER)
        self.headers = {
            'User-Agent': UserAgent().random,
            'Host': 'search.51job.com',
            # BUG FIX: the original value contained stray spaces
            # ("application / json, ...") which is not a valid Accept header.
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': 'https://search.51job.com/list/020000,000000,0000,00,9,99,%25E6%2591%2584%25E5%25BD%25B1%25E5%25B8%2588,2,9.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=',
            # NOTE(review): hard-coded session cookie copied from a browser;
            # it will expire — refresh it if requests start returning the
            # anti-bot page.
            'Cookie': "_uab_collina=166011585344663273017034; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; search=jobarea%7E%60020000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60020000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C9%E3%D3%B0%CA%A6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21; slife=lastvisit%3D020000%26%7C%26; partner=cn_bing_com; privacy=1660290871; guid=6360a8ae5a33b6fde8aa3a55c6a79fd4; acw_sc__v2=62f6073ce84e990f8d0a360f4118be99301d0ac7; SL_G_WPT_TO=zh-CN; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%226360a8ae5a33b6fde8aa3a55c6a79fd4%22%2C%22first_id%22%3A%22182869d9d3e16c-07d5ca338d1937-45647f50-1327104-182869d9d3fb12%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTgyODY5ZDlkM2UxNmMtMDdkNWNhMzM4ZDE5MzctNDU2NDdmNTAtMTMyNzEwNC0xODI4NjlkOWQzZmIxMiIsIiRpZGVudGl0eV9sb2dpbl9pZCI6IjYzNjBhOGFlNWEzM2I2ZmRlOGFhM2E1NWM2YTc5ZmQ0In0%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%226360a8ae5a33b6fde8aa3a55c6a79fd4%22%7D%2C%22%24device_id%22%3A%22182869d9d3e16c-07d5ca338d1937-45647f50-1327104-182869d9d3fb12%22%7D; SL_GWPT_Show_Hide_tmp=1; SL_wptGlobTipTmp=1; acw_tc=ac11000116602929812648254e00e1d11f6240b675be3361abad3b7b7d4f96; ssxmod_itna=iqmxBD0DcDg037Ke0LxYIEP7wKr7ieqDyG07QEAmx0vc3GzDAxn40iDt=rNmQrBrx4qw7oebIY87OiuDH3Ox2RTejAl84GLDmKDy4=xPGG0xBYDQxAYDGDDPDogPD1D3qDkD7h6CMy1qGWDm4kDWPDYxDrjOKDRxi7DDvQCx07DQ5k8DekoRawcagCimAPF0KD91oDsE0fY0FmjS34MpYEI3YdIx0kl40Oya5szaoDUlFsBoBNNoReWC0ei0Rxi7Dxw7DY8m2D8mG9qADxO70Dvlxq8vxn6wDDioWPOYD===; ssxmod_itna2=iqmxBD0DcDg037Ke0LxYIEP7wKr7ieqDyG07QEADn93uYqDsqq3DLA18V3N7qgLiQYF9E2Kx=i+aeh=dRG7ugGpwiFqgIO9DnR=4YFFj0cNw+gCwP4Vn7o6y+RC96=M75LbMgdB6146kSfYq7bUvYYtMjEfqaTh52giKyfEviGR+iMbdRjU+GYRdoi2d=lbtre=iTWTkWa=SFaTqlEn2I8bESb3OU+01B3ioru5cGLcrcawPvqamz3bcG=tR/lEvF+8n4zUBUTKXp98CFxG24gh74LrKtboYtbZYoGjy2r/+jDavWA37rDPCPAYiIWmVoprY0LQ+xDGcDG7YiDD="
        }

    def get_page_info(self, position, pages=1):
        """Fetch listing pages for *position* and append rows to the CSV.

        position: search keyword, already double-percent-encoded as 51job
                  expects in its URL path.
        pages:    number of result pages to scrape (default 1, matching the
                  original hard-coded range).
        """
        # Keys extracted from each job dict, in CSV_HEADER column order.
        keys = ['job_name', 'company_name', 'providesalary_text',
                'workarea_text', 'issuedate', 'companytype_text',
                'jobwelf', 'attribute_text', 'companyind_text',
                'companysize_text']
        for page in range(1, pages + 1):
            self.url = self.URL_TMPL.format(position, page)
            res = requests.get(self.url, headers=self.headers)
            match = self.RESULT_RE.search(res.text)
            if match is None:
                # BUG FIX: the original called .group(1) unconditionally and
                # crashed with AttributeError when the anti-bot/changed page
                # carried no embedded JSON.  Skip the page instead.
                print('未能解析第 {} 页的数据'.format(page))
                continue
            data = json.loads(match.group(1))
            # Open the output file once per page, not once per row.
            with open(self.CSV_PATH, 'a', encoding='utf-8', newline='') as f:
                writer = csv.writer(f)
                # BUG FIX: the original reused loop variable `i` for both
                # the page number and the job dict, shadowing the outer loop.
                for job in data['engine_jds']:
                    # .get(..., '') keeps one malformed listing from
                    # aborting the whole scrape with a KeyError.
                    writer.writerow([job.get(k, '') for k in keys])


s = Job51Spider()
name = input('请输入要搜索的职业:')
position = '%25'.join(parse.quote(name).split('%'))
s.get_page_info(position)