当前位置: 首页 > 工具软件 > 橘子 > 使用案例 >

Python 爬取 IT 桔子网

甄云
2023-12-01

Python 爬取 IT 桔子网

完整代码下载:https://github.com/tanjunchen/SpiderProject/tree/master/ITOrange

import requests
import re
import pymongo
import random
import time
import json
import random
import numpy as np
import csv
import pandas as pd
from fake_useragent import UserAgent
import socket  # 断线重试
from urllib.parse import urlencode

# Generator for random User-Agent strings (read via `ua.random` when building request headers).
ua = UserAgent()

# MongoDB connection on localhost:27017.
client = pymongo.MongoClient(host='localhost', port=27017)
# Select the "ITJUZI" database and the collection that receives company records.
db = client['ITJUZI']
mongodb_collection_company = db['itjuzi_company']


class ITJUZI(object):
    """Crawler for the itjuzi.com company-list API.

    Each page is fetched as JSON through a shared ``requests.Session`` and the
    rows found under ``response['data']['data']`` are inserted into the
    module-level ``mongodb_collection_company`` collection.
    """

    def __init__(self):
        self.headers = {
            'User-Agent': ua.random,
            'X-Requested-With': 'XMLHttpRequest',
            # Cookie copied from the site's home page.
            # NOTE(review): hard-coded session value; consider loading it from configuration.
            'Cookie': '76b20f6015442399597225100e18094750f575673b045da6ec0b77984422f6',
        }
        self.url = 'https://www.itjuzi.com/api/companys'  # company
        self.session = requests.Session()

    def get_table(self, page):
        """Download one page of the company list and store it.

        The request is attempted up to three times; only timeouts trigger a
        retry. If every attempt times out, the page is skipped silently apart
        from the per-attempt messages.

        Args:
            page: Page number to download.
        """
        # The page number must be part of the query, otherwise every call
        # requests the same listing.
        # NOTE(review): the key name "page" is assumed from the API's other
        # pagination fields ("per_page", "pagetotal") - confirm against the site.
        company_payload = {"pagetotal": 121292, "total": 0, "per_page": 30, "page": page,
                           "scope": "", "sub_scope": "", "round": "", "location": "",
                           "prov": "", "city": "", "status": "", "sort": "",
                           "selected": ""}
        max_attempts = 3
        for attempt in range(1, max_attempts + 1):
            try:
                response = self.session.get(
                    self.url, params=company_payload, headers=self.headers, timeout=(5, 20)).json()
            except (requests.exceptions.Timeout, socket.timeout):
                # requests reports timeouts as requests.exceptions.Timeout;
                # socket.timeout is still caught for backward compatibility.
                print('下载第{}页,第{}次网页请求超时'.format(page, attempt))
                continue
            print(response)
            self.save_to_mongo(response)
            break

    def save_to_mongo(self, response):
        """Insert the rows of one API response into MongoDB.

        Failures are reported on stdout rather than raised, so one bad page
        does not stop the crawl. After a successful insert the method sleeps
        for a random 3-6 seconds to throttle the crawl.

        Args:
            response: Decoded JSON body; rows are read from
                ``response['data']['data']``.
        """
        try:
            data = response['data']['data']
            if not data:
                # insert_many rejects an empty batch; report it explicitly
                # instead of relying on the exception it would raise.
                print('存储到mongodb失败', 'no documents in response')
                return
            df = pd.DataFrame(data)
            # Round-trip through JSON to get plain dicts, one per row.
            # insert_many is given a real list rather than a dict view.
            table = list(json.loads(df.T.to_json()).values())
            if mongodb_collection_company.insert_many(table):
                print('存储到mongodb成功')
                # randint's upper bound is exclusive, so this is 3..6 seconds.
                delay = np.random.randint(3, 7)
                time.sleep(delay)
        except Exception as e:
            # Best-effort boundary: log and carry on with the next page.
            print('存储到mongodb失败', e)

    def spider_itjuzi(self, start_page, end_page):
        """Download pages ``start_page`` up to, but not including, ``end_page``."""
        for page in range(start_page, end_page):
            print('下载第%s页:' % page)
            self.get_table(page)

        print('下载完成')


if __name__ == '__main__':
    # Crawl pages 398 through 4044 (the end page is exclusive).
    ITJUZI().spider_itjuzi(start_page=398, end_page=4045)
 类似资料: