import sys
import requests
import re
import time
import codecs
import json
import xml.etree.ElementTree as ET
from datetime import datetime, date, time, timedelta
from bs4 import BeautifulSoup
from xml.dom import minidom
from xpinyin import Pinyin
if sys.version_info[0] == 2:
from urllib2 import urlopen # Python 2
else:
from urllib.request import urlopen # Python3
def get_week_backup(now=None):
    """Return today's and tomorrow's weekday numbers as strings.

    Weekdays are numbered 1 (Monday) through 7 (Sunday), and the day after
    7 wraps around to 1.

    ``isoweekday()`` is used instead of ``strftime('%w')`` because ``%w``
    reports Sunday as 0 and never yields 7, which made the wrap-around
    check unreachable and produced ``'0'`` on Sundays.

    Args:
        now: Optional ``datetime`` to treat as the current moment.
            Defaults to ``datetime.now()``.

    Returns:
        A two-item list ``[today, tomorrow]`` of weekday-number strings.
    """
    if now is None:
        now = datetime.now()
    today = now.isoweekday()
    tomorrow = 1 if today == 7 else today + 1
    return [str(today), str(tomorrow)]
def get_week(now=None):
    """Return the weekday numbers from today through Sunday as strings.

    Weekdays are numbered 1 (Monday) through 7 (Sunday). The script in this
    file appends each value to a channel path as a ``_w<N>`` suffix.

    ``isoweekday()`` is used instead of ``strftime('%w')`` because ``%w``
    reports Sunday as 0, which made a Sunday run return ``'0'`` through
    ``'7'`` rather than just ``'7'``.

    Args:
        now: Optional ``datetime`` to treat as the current moment.
            Defaults to ``datetime.now()``.

    Returns:
        A list of weekday-number strings, starting at today's number and
        ending at ``'7'``.
    """
    if now is None:
        now = datetime.now()
    return [str(day) for day in range(now.isoweekday(), 8)]
def saveXML(root, filename, indent="\t", newl="\n", encoding="utf-8"):
    """Pretty-print an ElementTree element to *filename* as an XML document.

    Args:
        root: The ``xml.etree.ElementTree`` element to serialize.
        filename: Path of the file to create or overwrite.
        indent: String used for each nesting level of child elements.
        newl: String written after each node.
        encoding: Name announced in the XML declaration and used to encode
            the file. A falsy value writes the file as UTF-8 and passes the
            value through to ``writexml`` unchanged.
    """
    # ET.tostring() defaults to ASCII bytes with character references for
    # non-ASCII text; minidom turns them back into real characters on parse.
    raw_xml = ET.tostring(root)
    document = minidom.parseString(raw_xml)
    # Encode the file with the same codec that the declaration announces;
    # opening it as UTF-8 unconditionally mislabels any other encoding.
    with codecs.open(filename, 'w', encoding or 'utf-8') as out:
        document.writexml(out, "", indent, newl, encoding)
def ch_dict():
    """Scrape the channel listing pages and build the channel lookup tables.

    Each listing page named in ``ch_list`` is fetched from
    ``https://www.tianmaotv.cn/tv/``. Every anchor inside the
    ``<dl class="tv-station-list">`` element whose ``href`` starts with
    ``/jmb`` is recorded as a channel.

    Returns:
        A tuple ``(dict_url, dict_name)``. ``dict_url`` maps a channel's
        display text to its ``href``. ``dict_name`` maps the same display
        text to an id made from the pinyin initials of the name.

    Raises:
        AttributeError: If a page has no ``tv-station-list`` element
            (``soup.find`` returns ``None``).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0',
               'Connection': 'keep-alive', 'Cache-Control': 'no-cache'}
    url = "https://www.tianmaotv.cn/tv/"
    ch_list = ["type_101", "type_102", "tv_453", "tv_526"]
    dict_url = {}
    p = Pinyin()
    # id (pinyin initials) -> channel display name
    tmp_dic = {}
    # Compiled once; match() anchors at the start, so this is a prefix test.
    channel_href = re.compile(r"\/jmb")
    for ch in ch_list:
        response = requests.get(url + ch, headers=headers)
        # Let requests guess the charset from the body before decoding.
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'lxml')
        station_list = soup.find(name='dl', attrs={"class": "tv-station-list"})
        for tagprogram in station_list.find_all('a'):
            if not channel_href.match(tagprogram["href"]):
                continue
            dict_url[tagprogram.string] = tagprogram["href"]
            tmp = p.get_initials(tagprogram.string, u'')
            # Different channels can share initials; the second one gets a
            # "1" suffix.
            # NOTE(review): a third channel with the same initials overwrites
            # the "<initials>1" entry, so only two names per id survive.
            if tmp in tmp_dic:
                tmp_dic[tmp + "1"] = tagprogram.string
            else:
                tmp_dic[tmp] = tagprogram.string
    # Invert to name -> id; when one name holds several ids the last wins.
    dict_name = {v: k for k, v in tmp_dic.items()}
    return dict_url, dict_name
def _schedule_year(month, today):
    """Pick the calendar year for a listing month seen on date *today*."""
    # A listing week can straddle New Year, so a January entry seen in
    # December belongs to next year and vice versa.
    if today.month == 12 and month == 1:
        return today.year + 1
    if today.month == 1 and month == 12:
        return today.year - 1
    return today.year


def get_channel_programme(url_name_py, real_rul):
    """Fetch one schedule page and return its programmes keyed by start time.

    The page at ``https://www.tianmaotv.cn`` + *real_rul* is downloaded and
    every ``<dd>`` inside ``<dl class="program-list">`` is read. Each entry's
    ``<time>`` text is parsed as ``MM-DD HH:MM``. The page gives no year, so
    the current year is used, adjusted across a December/January boundary;
    a fixed year would mislabel every entry and reject ``02-29``.

    Args:
        url_name_py: Channel id; accepted for the caller's convenience but
            not used by this function.
        real_rul: Site-relative path of the schedule page.

    Returns:
        A dict mapping start time (``YYYYmmddHHMMSS`` string) to the stripped
        text taken from ``contents[2]`` of the entry.

    Raises:
        AttributeError: If the page has no ``program-list`` element or an
            entry has no ``<time>`` tag.
        ValueError: If a time string is not in ``MM-DD HH:MM`` form.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0',
               'Connection': 'keep-alive', 'Cache-Control': 'no-cache'}
    r = requests.get("https://www.tianmaotv.cn" + real_rul, headers=headers)
    # Let requests guess the charset from the body before decoding.
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'lxml')
    today = datetime.now()
    timestart = {}
    entries = soup.find(name='dl', attrs={"class": "program-list"}).find_all('dd')
    for entry in entries:
        stamp = entry.time.string.strip()
        month = int(stamp.split("-", 1)[0])
        year = _schedule_year(month, today)
        start = datetime.strptime("%d-%s" % (year, stamp), '%Y-%m-%d %H:%M')
        timestart[start.strftime("%Y%m%d%H%M%S")] = entry.contents[2].strip()
    return timestart
# Script body: scrape the channel list, fetch the schedule of every channel
# for the rest of the week, and write everything to "e.xml" as an XMLTV
# document (<tv> root with <channel> and <programme> children).
aa = ch_dict()
channel_url = aa[0]  # display name -> site-relative channel path
channel_py = aa[1]   # display name -> channel id (pinyin initials)

new_root = ET.Element('tv')
new_root.set("generator-info-name", "3mile")
new_root.set("generator-info-url", "https://3mile.top")
new_root.tail = "\n"

# One <channel> element per scraped channel.
for k, v in channel_py.items():
    child = ET.SubElement(new_root, "channel")
    child.set("id", v)
    child_name = ET.SubElement(child, "display-name")
    child_name.set("lang", "zh")
    child_name.text = k
    child_url = ET.SubElement(child, 'url')
    # NOTE(review): this host differs from the one the pages are fetched
    # from (www.tianmaotv.cn) -- confirm it is intentional.
    child_url.text = "https://tianmao.tv" + channel_url[k]

wday = get_week()
for k, v in channel_url.items():
    programme = {}
    print("正在获取 "+k+" EPG信息")
    for i in wday:
        # Drop the trailing slash, then address the page of weekday ``i``.
        # NOTE(review): the pattern was truncated in the source as received;
        # r'\/$' is the reconstruction -- confirm against the site's URLs.
        url = re.sub(r'\/$', "", v) + "_w" + i + "/"
        progr = get_channel_programme(channel_py[k], url)
        programme.update(progr)
    # Keys are YYYYmmddHHMMSS strings, so sorting them is chronological and
    # does not depend on dict ordering (arbitrary on older interpreters).
    list_time = sorted(programme)
    for t, begin in enumerate(list_time):
        tit = programme[begin]
        if t + 1 < len(list_time):
            # A programme stops when the next one starts.
            end = list_time[t + 1]
        else:
            # Last known programme: assume a one-hour duration.
            ta1 = datetime.strptime(begin, '%Y%m%d%H%M%S')
            ta1 = ta1 + timedelta(hours=1)
            end = ta1.strftime('%Y%m%d%H%M%S')
        programme_sub = ET.SubElement(new_root, "programme")
        programme_sub.set("start", begin + " +0800")
        programme_sub.set("stop", end + " +0800")
        programme_sub.set("channel", channel_py[k])
        programme_title = ET.SubElement(programme_sub, "title")
        programme_title.set("lang", "zh")
        programme_title.text = tit
    print("已经获取 "+k+"EPG内容")
saveXML(new_root, "e.xml")