
Python x Xiao AI speaker: scraping JD.com reviews of the Xiao AI speaker with Python

叶鸿煊
2023-12-01

import requests
from bs4 import BeautifulSoup as bs  # not actually used below
import re
import pandas as pd
from sqlalchemy import create_engine
from pandas.io.sql import to_sql as pd_sql
import pymysql  # MySQL driver behind SQLAlchemy's mysql+pymysql engine
import random
import time

# Define a helper that writes a pandas DataFrame into MySQL

def pandas_to_mysql(df_data, table_name, **kwargs):
    engine = create_engine('mysql+pymysql://{}:{}@{}:{}/{}?charset={}'.format(
        kwargs['user'], kwargs['password'], kwargs['host'],
        kwargs['port'], kwargs['db'], kwargs['charset']))
    pd_sql(df_data, table_name, engine, index=False, if_exists='append', chunksize=10000)  # if_exists: 'replace' or 'append'
    engine.dispose()
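# Note (not in the original code): on current pandas versions the same write can be done with
# the public DataFrame.to_sql method instead of the internal pandas.io.sql.to_sql helper, e.g.
#     df_data.to_sql(table_name, engine, index=False, if_exists='append', chunksize=10000)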

db_local_hwdata = {'user': 'root', 'password': '8888', 'port': 3306,
                   'host': 'localhost', 'db': 'hwdata', 'charset': 'utf8mb4'}

# Proxy pool: each request below picks one of these HTTP proxies at random
proxy_pool = [
    'http://120.83.110.65:9999', 'http://183.146.156.209:9999', 'http://123.169.34.94:9999',
    'http://117.57.90.141:9999', 'http://117.69.201.19:9999', 'http://117.95.192.240:9999',
    'http://163.204.244.161:9999', 'http://163.204.246.168:9999', 'http://171.35.160.62:9999',
    'http://36.22.77.186:9999', 'http://120.83.107.184:9999',
]

# Product review API (JD's sclub comment endpoint); page 0 is fetched first to read the total review count
url = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv33766&productId=5239477&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1'

headers = {'Referer': 'https://item.jd.com/5239477.html',
           'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}

result = requests.get(url, headers=headers, proxies={'http': random.choice(proxy_pool)})

result_text = result.text
result_text = result_text.replace('\\n', '')  # remove escaped newlines embedded in the review text

# Total number of reviews, used for pagination (10 reviews per page)
comment_number = re.search(r'"commentCount":([\d]+),', result_text, re.S).group(1)
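# The callback= parameter makes this endpoint return JSONP, i.e. the JSON body wrapped in a
# fetchJSON_comment98vv33766(...) call, which is why fields are pulled out of the raw text with
# regular expressions instead of json.loads (see the json-based sketch after the script).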

# Iterate page by page, 10 reviews per page (integer division drops any final partial page)
for page_i in range(0, int(comment_number) // 10):
    time.sleep(random.uniform(0.6, 4))  # random delay between requests
    url = f'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv33766&productId=5239477&score=0&sortType=5&page={page_i}&pageSize=10&isShadowSku=0&fold=1'
    result = requests.get(url, headers=headers, proxies={'http': random.choice(proxy_pool)})

    result_text = result.text
    result_text = result_text.replace('\\n', '')  # remove escaped newlines embedded in the review text

    # Pull one tuple of fields per review out of the raw JSONP text
    analyse_result = re.findall(r'"guid":"(.*?)",.*?"content":"(.*?)",.*?"creationTime":"(.*?)".*?"isDelete":(.*?),.*?"isTop":(.*?),.*?"replyCount":(.*?),.*?"score":(.*?),.*?"imageStatus":(.*?),.*?"usefulVoteCount":(.*?),.*?"userClient":(.*?),.*?"imageCount":(.*?),.*?"anonymousFlag":(.*?),.*?"plusAvailable":(.*?),.*?"productColor":"(.*?)".*?"imageIntegral".*?,(.*?)"status".*?,"referenceTime":"(.*?)".*?nickname":"(.*?)".*?"days":(.*?),.*?"afterDays":(.*?)}', result_text, re.S)
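    # Each match yields 19 capture groups, in the same order as the DataFrame columns defined below
    # (guid is stored as commentUrl, score as stars, nickname as userName, and so on)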

    final_data = []
    for rt in analyse_result:
        person_data = []
        for ch_i, character in enumerate(rt):
            # Group 14 is the raw follow-up ("append") review blob; re-extract just its content text
            if ch_i == 14 and (character != ''):
                try:
                    character = re.search('"content":"(.*?)",', character, re.S).group(1)
                except Exception:
                    character = ''
            # Group 18 (afterDays) is blanked when it is smaller than group 17 (days)
            if ch_i == 18 and (int(character) < int(rt[-2])):
                character = ''
            person_data.append(character)
        final_data.append(person_data)

    # Convert this page's reviews into a DataFrame
    dfanalyse_result = pd.DataFrame(final_data, columns=['commentUrl', 'comment', 'commentTime', 'isDelete', 'isTop', 'replyCount', 'stars', 'imageFlag', 'voteCount', 'userClient', 'imageCount', 'anonymousFlag', 'plusFlag', 'productCatergory', 'afterComment', 'purchaseTime', 'userName', 'commentAfterBuyingTime', 'afterCommentAfterBuyingTime'])

    # Append this page into MySQL
    pandas_to_mysql(dfanalyse_result, 'w20191212jingdong_comments', **db_local_hwdata)

    print(f'page_i {page_i} finished!')
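
The regex-based field extraction above is brittle: any reordering of keys in the response breaks the capture groups. Because the endpoint returns ordinary JSON wrapped in the fetchJSON_comment98vv33766(...) callback, a more robust option is to strip that wrapper and parse with the json module. The sketch below is not from the original post; it assumes the payload exposes productCommentSummary.commentCount and a comments list whose entries carry keys such as content, creationTime, score, nickname and usefulVoteCount, which is what the regexes above already match against.

import json
import re

def parse_comment_page(jsonp_text):
    """Strip the JSONP callback wrapper and return (total review count, list of review dicts)."""
    body = re.search(r'^[^(]*\((.*)\)\s*;?\s*$', jsonp_text, re.S).group(1)  # keep only the JSON inside callback(...)
    data = json.loads(body)
    comment_count = data.get('productCommentSummary', {}).get('commentCount', 0)
    rows = []
    for c in data.get('comments', []):  # assumed key, matching the fields the regexes capture
        rows.append({
            'comment': c.get('content', ''),
            'commentTime': c.get('creationTime', ''),
            'stars': c.get('score', ''),
            'voteCount': c.get('usefulVoteCount', ''),
            'userName': c.get('nickname', ''),
        })
    return comment_count, rows

The per-page rows could then go through pd.DataFrame(rows) and pandas_to_mysql exactly as in the loop above.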
