由于工作需要,要抓取谷歌 app 市场的 icon,花了点时间粗略地看了下 Python,基本搞出来了,代码如下:
#!/usr/bin/env python
#-*- encoding: utf8 -*-
# author : xxx
# version: 1.1.0
# Date : 2015/09/07 09:30:00
# 功能 : 定期在Google Play抓取游戏软件最新版本信息
# 版本号标签 <div class="content" itemprop="softwareVersion"> 1.0.0 </div>
# 图片标签 <img class="cover-image" src="https://lh3.googleusercontent.com/68uKqs4VBQ5Sl2f7kqGmy1sLYzezmAn_LrV993b4Vw6vn5gAYRk0mGhqC3ZnLnzeU0A=w300-rw" alt="Cover art" aria-hidden="true" itemprop="image">
import time
import urllib2
import HTMLParser
import re
import os
import sys
import xlrd
from GifImagePlugin import getdata
# Excel workbook listing the apps whose icons should be fetched.
# Written as a plain (non-raw) unicode literal so the line parses on both
# Python 2 and Python 3; the value is the same as the former ur"..." literal.
xls_filename = u"D:\\需抓取icon资源信息.xlsx"
# Base URL of a Google Play details page; the package name is appended to it.
host = "https://play.google.com/store/apps/details?hl=zh_CN&id="
# Local directory under which downloaded icons are saved.
localSavePath = 'D:\\icon\\'
# Directory for the log files.  HTML <span> markup had leaked into this
# literal and made the line a syntax error; only the markup was removed.
# NOTE(review): this is a raw literal, so the value contains doubled
# backslashes -- confirm that is what the log path is meant to be.
logPath = r"D:\\icon\\"
# Parse the Excel workbook.
def analyXls(filename):
    """Read every sheet of a workbook and build one XlsModel per data row.

    Rows 0-2 of each sheet are skipped (presumably header rows -- TODO
    confirm against the real workbook).  Columns 1-5 of every remaining row
    are read as resId, resType, resName, pkgName and pkgVersion.

    :param filename: path of the workbook to open.
    :returns: list of XlsModel objects in sheet order, then row order.
    """
    print("analy excel file start ... ")
    # Models for all apps found in the workbook.
    modelList = []
    # Fix: open the workbook that was passed in.  The original ignored
    # ``filename`` and always opened the module-level ``xls_filename``.
    data = xlrd.open_workbook(filename)
    for sheet_name in data.sheet_names():
        print(sheet_name)
        # One worksheet.
        sheet = data.sheet_by_name(sheet_name)
        # Columns 1..5 are read below, so a sheet with fewer than six
        # columns cannot hold a data row; skip it instead of failing on the
        # first cell access (the original only guarded against ncols < 1).
        if sheet.ncols < 6:
            continue
        for row in range(3, sheet.nrows):
            resId = str(int(sheet.cell_value(row, 1))).strip()
            resType = str(int(sheet.cell_value(row, 2))).strip()
            resName = sheet.cell_value(row, 3)
            pkgName = str(sheet.cell_value(row, 4)).strip()
            pkgVersion = str(sheet.cell_value(row, 5)).strip()
            model = XlsModel(resId=resId, resType=resType, resName=resName,
                             pkgName=pkgName, pkgVersion=pkgVersion)
            modelList.append(model)
    print("analy excel file end ... ")
    return modelList
# Fetch a web page.
def getDoc(url):
    """Download ``url`` and return the response body.

    :param url: address to request (6 second timeout).
    :returns: the body on success; ``None`` when the server answers 404;
        the ``reason`` of the URLError when that reason reads "timed out".
    :raises Exception: for every other HTTP, URL or unexpected error.
    """
    response = None
    try:
        response = urllib2.urlopen(url, timeout=6)
        html = response.read()
        print("获取应用成功:[Code:%s] %s" % (response.code, url))
        return html
    except urllib2.HTTPError as e:
        print("HTTPError------")
        print("获取应用失败:[Code:%s]" % e.code)
        print(url)
        # Only a 404 is reported as "not found"; anything else is fatal.
        if str(e.code) == "404":
            return None
        raise Exception("非404错误,访问url网络异常", e)
    except urllib2.URLError as e:
        print("URLError------")
        print("获取应用失败:[Code:%s]" % e.reason)
        print(url)
        # VPN dropped / no network: hand the reason back so the caller can
        # recognise the timeout by its text.
        if "timed out" != str(e.reason):
            raise Exception("未知错误,访问url网络异常", e)
        return e.reason
    except Exception as e:
        print("Exception------")
        print(e)
        raise Exception("非404错误,访问url网络异常", e)
    finally:
        # Runs on every path above, including the early returns.
        if response:
            response.close()
# Version number.
def getVersion(doc):
    """Extract the app version from a Google Play details page.

    :param doc: page HTML, either bytes or text.
    :returns: the stripped text of the ``softwareVersion`` element as text,
        or "" when the page has no such element.
    """
    # Fix: the original called ``.decode()`` on the match, which uses the
    # ASCII codec and raises on any non-ASCII version text.  Decode the page
    # once, up front.  Assumes the page is UTF-8 (TODO confirm); undecodable
    # bytes are replaced rather than raising.
    if isinstance(doc, bytes):
        doc = doc.decode("utf-8", "replace")
    reg = r'<div class="content" itemprop="softwareVersion">(.+?)</div>'
    match = re.search(reg, doc)
    if match:
        return match.group(1).strip()
    return ""
# URL of the icon image.
def getIconUrl(doc):
    """Return the ``src`` of the cover-image tag in ``doc``, or None.

    :param doc: page HTML to search.
    :returns: the stripped ``src`` attribute of the first matching
        ``<img class="cover-image" ...>`` tag, or ``None`` if none matches.
    """
    pattern = r'<img class="cover-image" src="(.+?)" alt="Cover art" aria-hidden="true" itemprop="image">'
    found = re.search(pattern, doc)
    return found.group(1).strip() if found else None
# Download the icon.
def downloadImage(url, xlsModel, timeout=6):
    """Download the icon at ``url`` and save it under ``localSavePath``.

    The target directory and file name come from ``xlsModel.getFilePath()``
    and ``xlsModel.getFileName()``; the extension is taken from the
    response's Content-Type.  JPEG responses are saved with a ``.png``
    extension (the bytes are written unchanged, only the name differs).

    :param url: icon address; must not be empty or None.
    :param xlsModel: model that supplies the save path and file name.
    :param timeout: seconds to wait for the server (new, optional).
    :raises ValueError: when ``url`` is empty or None.
    """
    print("icon url ... %s" % url)
    # Fix: the caller passes getIconUrl()'s result straight in, which is
    # None when the page has no icon tag; fail with a clear message.
    if not url:
        raise ValueError("icon url is missing")
    # Fix: a timeout keeps a stalled download from blocking the whole run,
    # and the response is now always closed.
    response = urllib2.urlopen(url, timeout=timeout)
    try:
        contentType = response.info().getheader("Content-Type") or ""
        # Read before touching the disk so a failed read leaves no empty file.
        cont = response.read()
    finally:
        response.close()
    # The image subtype doubles as the file extension.  Drop any parameters
    # first ("image/png; charset=binary" -> "image/png").
    ext = contentType.split(";")[0].strip()
    ext = ext[ext.find('/') + 1:]
    if not ext:
        # No usable Content-Type; fall back to the extension that JPEG icons
        # are already renamed to below.
        ext = "png"
    if ext.lower() in ("jpeg", "jpg"):
        print("原类型为:" + ext + " 转为:png")
        ext = "png"
    filepath = localSavePath + xlsModel.getFilePath()
    if not os.path.exists(filepath):
        os.makedirs(filepath)
    filename = filepath + os.sep + xlsModel.getFileName() + "." + ext
    # ``with`` closes the file even if the write fails.
    with open(filename, 'wb') as f:
        f.write(cont)
    print("ok 文件下载成功: %s" % (filename))
# Compare two version strings, e.g. a = "v 1.2.7 a" and b = "a1.2.18 a".
_VERSION_ALPHA_RUN = re.compile(r"[A-Za-z.]+")
_VERSION_DIGIT_RUN = re.compile(r"[0-9.]+")


def _versionTokens(version):
    """Split a space-free version string into comparable tokens.

    The string is cut into alternating runs of letters and of digits; dots
    belong to whichever run they follow.  A letter run yields one
    ``("match1", letters)`` token with the dots removed.  A digit run yields
    one ``("match2", int)`` token per dot-separated number, after dropping
    trailing zero components so that "1.0" and "1" tokenise identically.

    :raises ValueError: on a character that is neither letter, digit nor
        dot, or on an empty number such as the middle of "1..2".
    """
    tokens = []
    pos = 0
    while pos < len(version):
        run = _VERSION_ALPHA_RUN.match(version, pos)
        if run:
            tokens.append(("match1", run.group().replace(".", "")))
        else:
            run = _VERSION_DIGIT_RUN.match(version, pos)
            if not run:
                # The original looped forever here because nothing consumed
                # the unsupported character.
                raise ValueError("unsupported character in version %r" % version)
            numbers = [int(part) for part in run.group().strip(".").split(".")]
            while len(numbers) > 1 and numbers[-1] == 0:
                numbers.pop()
            tokens.extend(("match2", number) for number in numbers)
        pos = run.end()
    return tokens


def eqSize(v1, v2):
    '''
    Compare two version strings, ignoring spaces and letter case.

    Returns
    1: v1 > v2
    2: v2 > v1
    3: v1 == v2
    4: cannot be compared, e.g. "v1.1" against "1.2"

    Numbers are compared as integers, component by component, so
    "1.2.18" > "1.2.7" and "10.0" > "1.0".  The previous implementation
    stripped every trailing "0" character and padded the shorter number with
    zeros, which made "10.0" equal to "1.0" and "1.2.7" greater than
    "1.2.18".  Letter runs are compared as strings.  When one version is a
    prefix of the other, the longer one is the greater.  Any error while
    comparing is printed and reported as 4.
    '''
    try:
        iv1 = v1.replace(" ", "").lower()
        iv2 = v2.replace(" ", "").lower()
        # Both empty (or both a lone dot) count as equal; only one of them
        # being empty cannot be compared.
        if iv1 == iv2 and iv1 in ("", "."):
            return 3
        if iv1 in ("", ".") or iv2 in ("", "."):
            return 4
        tokens1 = _versionTokens(iv1)
        tokens2 = _versionTokens(iv2)
        for (kind1, value1), (kind2, value2) in zip(tokens1, tokens2):
            # A letter run facing a number at the same position.
            if kind1 != kind2:
                return 4
            if value1 != value2:
                return 1 if value1 > value2 else 2
        # Common prefix is equal: the version with extra tokens is greater.
        if len(tokens1) != len(tokens2):
            return 1 if len(tokens1) > len(tokens2) else 2
        return 3
    except Exception as e:
        print("比较大小Exception------")
        print(e)
        return 4
def writeLog(modelList):
    '''
    Write the summary log: one section per status, each with a heading,
    the number of matching apps and one line per app.

    Status codes
    0 : not processed
    1 : app not found on Google Play
    2 : requested version is lower than the Google Play version
    3 : requested version is higher than the Google Play version
    4 : updated successfully
    5 : failed with an exception or error
    6 : versions cannot be compared
    '''
    # (status, heading, whether the line also carries both versions)
    sections = (
        (1, "--------谷歌市场找不到的应用-------", False),
        (2, "--------请求版本,低于谷歌市场版本-------", True),
        (3, "--------请求版本,高于谷歌市场版本-------", True),
        (4, "--------更新成功-------", True),
        (5, "--------异常、错误失败的应用-------", False),
        (6, "--------版本无法比较的应用-------", True),
    )
    log = Log(Log.allApp)
    for status, heading, withVersions in sections:
        # Blank line on the console before every section.
        print("")
        log.writeLog(heading)
        matched = [entry for entry in modelList if entry.status == status]
        log.writeLog("总数: " + str(len(matched)))
        for model in matched:
            line = model.resId + " " + model.pkgName
            if withVersions:
                line = line + " " + model.pkgVersion + " " + model.google_pkgVersion
            log.writeLog(line)
    log.close()
# Logging helper.
class Log:
    """A log file that echoes every entry to the console.

    The class attributes are the name prefixes of the individual log files.
    Status codes used by writeLog2 / writeLog3:
    1 app not found on Google Play, 2 requested version lower than Google
    Play, 3 requested version higher, 4 updated successfully, 5 failed with
    an exception, 6 versions cannot be compared.
    """
    notfoundApp = "notfoundApp"
    lowApp = "lowApp"
    highApp = "highApp"
    successApp = "successApp"
    errorApp = "errorApp"
    errorVersionApp = "errorVersionApp"
    allApp = "allApp"

    # Console prefix printed by writeLog3 for each status.
    _statusLabels = {
        1: "谷歌市场找不到的应用 :",
        2: "请求版本 低于谷歌市场版本:",
        3: "请求版本 高于谷歌市场版本 :",
        4: "更新成功 :",
        5: " 异常、错误失败的app :",
        6: "版本无法比较 :",
    }

    # Text written between the requested and the Google Play version.
    # Fix: status 4 (success, i.e. the versions match) used to be written
    # with " > ", copied from status 3.
    _versionRelations = {2: " < ", 3: " > ", 4: " == ", 6: " != "}

    def __init__(self, name):
        """Create ``<logPath>/<name><timestamp>.log``, creating the directory if needed."""
        if not os.path.exists(logPath):
            os.makedirs(logPath)
        now = time.strftime('%Y%m%d%H%M%S')
        filename = logPath + os.sep + name + now + ".log"
        self.f = open(filename, "wb+")

    def writeLog(self, str):
        """Print ``str`` and append it to the file as one line."""
        print(str)
        self.f.write(str + os.linesep)
        self.f.flush()

    def writeLog2(self, model):
        """Log one model in the format that belongs to its status.

        Models whose status is not 1-6 are ignored.
        """
        status = model.status
        if status not in self._statusLabels:
            return
        line = model.resId + " " + model.pkgName
        if status in self._versionRelations:
            line = (line + " " + model.pkgVersion
                    + self._versionRelations[status] + model.google_pkgVersion)
        elif status == 5:
            line = line + " " + str(model.ex)
        self.writeLog3(status, line)

    def writeLog3(self, status, str):
        """Print ``str`` with the status prefix and append ``str`` to the file.

        Nothing is printed or written for a status outside 1-6.
        """
        label = self._statusLabels.get(status)
        if label is None:
            return
        print(label + str)
        self.f.write(str + os.linesep)
        self.f.flush()

    def close(self):
        """Close the underlying file."""
        self.f.close()
# One row of the Excel sheet.
class XlsModel:
    """An app read from the workbook, plus the state of its processing."""

    def __init__(self, resId, resType, resName, pkgName, pkgVersion):
        self.resId = resId
        self.resType = resType
        self.resName = resName
        self.pkgName = pkgName
        self.pkgVersion = pkgVersion
        # Processing status:
        #  1 : app not found on Google Play
        #  2 : requested version is lower than the Google Play version
        #  3 : requested version is higher than the Google Play version
        #  4 : updated successfully
        #  5 : failed with an exception or error
        #  6 : versions cannot be compared
        # 10 : not processed yet (initial value; the original notes also
        #      list 0 as "not processed", but 0 is never assigned here)
        self.status = 10
        # Version found on Google Play; filled in while processing.
        self.google_pkgVersion = ""
        # Fix: Log.writeLog2 reads ``ex`` for status 5, but the attribute
        # only existed once main() had assigned it.  Start with None.
        self.ex = None

    def __str__(self):
        return "resId" + " : " + self.resId + " " \
            + "resType" + " : " + self.resType + " " \
            + "resName" + " : " + self.resName + " " \
            + "pkgName" + " : " + self.pkgName + " " \
            + "pkgVersion" + " : " + self.pkgVersion

    # File name of the icon, without extension.
    def getFileName(self):
        return "icon-google"

    # Directory of the icon, relative to the save root:
    # <resId % 1000>/<resType>-<resId>
    def getFilePath(self):
        path1 = int(self.resId) % 1000
        path2 = self.resType + "-" + self.resId
        return str(path1) + os.sep + path2

    # URL of the app's Google Play details page.
    def getNetworkPath(self):
        return host + self.pkgName
# Run the crawl.
# Only versions made of these characters are handed to eqSize.
_COMPARABLE_VERSION = re.compile('^[A-Za-z0-9. ]+$', re.S)


def _versionStatus(googleVersion, requestedVersion):
    """Translate a pair of versions into a model status (2, 3, 4 or 6)."""
    if googleVersion == requestedVersion:
        return 4
    if not (_COMPARABLE_VERSION.match(googleVersion)
            and _COMPARABLE_VERSION.match(requestedVersion)):
        return 6
    # eqSize result -> status: Google newer -> 2, Google older -> 3,
    # equal -> 4, not comparable -> 6.
    rs = eqSize(googleVersion, requestedVersion)
    return {1: 2, 2: 3, 3: 4, 4: 6}.get(rs, 6)


def _processModel(model, logs):
    """Fetch one app's page, decide its status, download the icon if due, and log it."""
    url = model.getNetworkPath()
    doc = getDoc(url)
    # A timeout means the connection (VPN) is down; keep trying until the
    # page can be fetched again.
    while str(doc) == "timed out":
        doc = getDoc(url)
        print("vpn链接断开 或其他问题导致连不上谷歌市场... %s %s" % (model.resId, model.pkgName))
    if doc is None:
        status = 1
    else:
        model.google_pkgVersion = getVersion(doc)
        status = _versionStatus(model.google_pkgVersion, model.pkgVersion)
        if status == 4:
            # The status is only recorded once the download has succeeded.
            downloadImage(getIconUrl(doc), model)
    model.status = status
    logs[status].writeLog2(model)


def main(maxRounds=3):
    """Process every app in the workbook and write the per-status logs.

    Apps that fail with an exception (status 5) are processed again in a
    further round.  Fix: the original repeated such rounds without limit,
    so one permanently failing app kept the script running forever; at most
    ``maxRounds`` rounds are made now, and apps still failing afterwards
    stay at status 5 and appear in the summary log.

    :param maxRounds: maximum number of passes over the failing apps.
    """
    modelList = analyXls(xls_filename)
    # One log per status code.
    logs = {
        1: Log(Log.notfoundApp),
        2: Log(Log.lowApp),
        3: Log(Log.highApp),
        4: Log(Log.successApp),
        5: Log(Log.errorApp),
        6: Log(Log.errorVersionApp),
    }
    try:
        models = modelList[:]
        inum = 0
        while models and inum < maxRounds:
            inum += 1
            count = len(models)
            for exeCount, model in enumerate(models, 1):
                point = time.time()
                print("")
                print("")
                print("")
                try:
                    _processModel(model, logs)
                except Exception as ex:
                    model.status = 5
                    model.ex = ex
                    logs[5].writeLog2(model)
                # As before, no timing line is printed for apps not found.
                if model.status == 1:
                    continue
                print("gameid:%s %s used %s sec" % (model.resName, model.pkgName, time.time() - point))
                print("num %s count: %s exeCount: %s 剩余:%s " % (inum, count, exeCount, (count - exeCount)))
            # Next round: only the apps that failed with an exception.
            models = [i for i in modelList if i.status == 5]
        if models:
            print("giving up on %s app(s) after %s round(s)" % (len(models), inum))
    finally:
        # Close the per-status logs even when a round aborts.
        for log in logs.values():
            log.close()
    writeLog(modelList)
# Script entry point: run the crawl and report wall-clock start, end and
# elapsed time.
if __name__ == "__main__":
    print('>>BEGIN<<')
    start = time.time()
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start)))

    main()

    end = time.time()
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end)))
    print("#########get_icon_for_googleapp_by_id_and_version_0.py over. Used %s" % (end - start))
    print('>>END<<')