1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
| import datetime import requests import warnings from bs4 import BeautifulSoup as bs from xml.etree.ElementTree import Element, SubElement, ElementTree
# Request headers sent with every HTTP call so the EPG server sees a
# desktop-browser User-Agent.
headers = {}
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'

# Silence warnings for the whole process.
# NOTE(review): this is a blanket filter -- it hides every warning category,
# not only bs4's XMLParsedAsHTMLWarning that motivated it.
warnings.filterwarnings(action='ignore')
# Fetch the Guangdong TV (GDTV) channel list.
def get_channels_gdtv():
    """Download the GDTV channel index and return the channels it lists.

    Returns:
        list[dict]: one ``{'id': <id attribute>, 'name': <ctitle text>}``
        entry per ``<channel>`` element that has both an ``id`` attribute
        and a ``<ctitle>`` child. Incomplete elements are skipped.

    Raises:
        requests.RequestException: propagated from ``requests.get`` (for
        example when the 8 second timeout expires).
    """
    url = 'http://epg.gdtv.cn/f/1.xml'
    # Same timeout as the per-day schedule fetch; without one a stalled
    # server would block the script forever.
    res = requests.get(url, headers=headers, timeout=8)
    res.encoding = 'utf-8'
    soup = bs(res.text, 'html.parser')

    channels = []
    for content in soup.select('channel'):
        channel_id = content.attrs.get('id')
        title_tag = content.ctitle
        if channel_id is None or title_tag is None:
            # A malformed entry should not abort the whole listing.
            continue
        # strip() removes stray spaces/newlines around the channel name.
        channels.append({'id': channel_id, 'name': title_tag.text.strip()})
    return channels
# Normalise HTML entities and tags found in programme titles.
def clean_html(text):
    """Return *text* as plain text with ``<br>`` line breaks kept as newlines.

    The markup is parsed with BeautifulSoup's ``html.parser``: entities are
    decoded (e.g. ``&amp;`` becomes ``&``) and tags are removed. ``<br>``
    tags are turned into ``'\\n'`` before the text is extracted; extracting
    first would discard them and glue the neighbouring text together.

    Args:
        text: raw title markup.

    Returns:
        str: the cleaned title with leading/trailing whitespace stripped.
    """
    soup = bs(text, "html.parser")
    for br in soup.find_all('br'):
        br.replace_with('\n')
    plain = soup.get_text()
    # Entity-escaped breaks (``&lt;br&gt;``) only become visible after
    # decoding, so they are handled on the extracted text.
    return plain.replace('<br />', '\n').replace('<br>', '\n').strip()
# Fetch the programme schedule for a channel over a date range.
def get_epgs_gdtv(channel_id, start_date, end_date):
    """Collect programme entries for one channel, one request per day.

    Args:
        channel_id: channel identifier used in the schedule URL.
        start_date: first day to fetch (inclusive); must support
            ``strftime`` and ``+ timedelta``.
        end_date: last day to fetch (inclusive).

    Returns:
        list[dict]: ``{'title': str, 'start': datetime, 'end': datetime}``
        for every ``<content>`` element found. ``start``/``end`` are
        timezone-aware datetimes in UTC+08:00.

    Raises:
        requests.RequestException: propagated from ``requests.get``.
    """
    # The XMLTV writer in this file labels times as +0800, so the epoch values
    # are converted in that zone rather than in the host's local timezone.
    tz_cst = datetime.timezone(datetime.timedelta(hours=8))
    one_day = datetime.timedelta(days=1)

    epgs = []
    current_date = start_date
    while current_date <= end_date:
        url = f'http://epg.gdtv.cn/f/{channel_id}/{current_date.strftime("%Y-%m-%d")}.xml'
        res = requests.get(url, headers=headers, timeout=8)
        res.encoding = 'utf-8'
        soup = bs(res.text, 'html.parser')

        for entry in soup.select('content'):
            try:
                # time1/time2 are passed to fromtimestamp, i.e. treated as
                # Unix epoch seconds.
                starttime = datetime.datetime.fromtimestamp(int(entry.attrs['time1']), tz=tz_cst)
                endtime = datetime.datetime.fromtimestamp(int(entry.attrs['time2']), tz=tz_cst)
            except (KeyError, ValueError, OverflowError, OSError):
                # Missing or malformed timestamps: skip this entry instead of
                # aborting the whole scrape.
                continue
            epgs.append({
                'title': clean_html(entry.get_text()),
                'start': starttime,
                'end': endtime,
            })
        current_date += one_day
    return epgs
# Build the XMLTV file.
def create_epg_xml(channels, epg_data, output_file):
    """Write channels and programmes to *output_file* as an XMLTV document.

    Args:
        channels: iterable of ``{'id': str, 'name': str}`` dicts; each becomes
            a ``<channel id=...>`` with a ``<display-name>`` child.
        epg_data: mapping of channel id to a list of
            ``{'title': str, 'start': datetime, 'end': datetime}`` dicts; each
            becomes a ``<programme>`` with a ``<title>`` child.
        output_file: path or file object accepted by ``ElementTree.write``.

    Times are written as ``YYYYmmddHHMMSS +0800``. Each datetime is converted
    to UTC+08:00 first, so the wall-clock digits always match the offset:
    aware values are shifted, and naive values are interpreted as system local
    time by ``datetime.astimezone``.
    """
    tz_cst = datetime.timezone(datetime.timedelta(hours=8))
    time_format = '%Y%m%d%H%M%S %z'  # %z renders as +0800 for tz_cst

    tv = Element('tv')

    # Channel definitions.
    for channel in channels:
        channel_element = SubElement(tv, 'channel', id=channel['id'])
        display_name = SubElement(channel_element, 'display-name')
        display_name.text = channel['name']

    # Programme entries.
    for channel_id, programs in epg_data.items():
        for program in programs:
            programme = SubElement(
                tv,
                'programme',
                start=program['start'].astimezone(tz_cst).strftime(time_format),
                stop=program['end'].astimezone(tz_cst).strftime(time_format),
                channel=channel_id,
            )
            title = SubElement(programme, 'title')
            title.text = program['title']

    # Serialise to disk with an XML declaration.
    tree = ElementTree(tv)
    tree.write(output_file, encoding='utf-8', xml_declaration=True)
# Script entry point.
if __name__ == "__main__":
    # Window to scrape: from six days ago through two days ahead.
    reference_day = datetime.datetime.now().date()
    start_date = reference_day - datetime.timedelta(days=6)
    end_date = reference_day + datetime.timedelta(days=2)

    print("获取频道列表...")
    channels = get_channels_gdtv()

    print("抓取节目表...")
    # One schedule list per channel id.
    epg_data = {
        entry['id']: get_epgs_gdtv(entry['id'], start_date, end_date)
        for entry in channels
    }

    print("生成 XMLTV 文件...")
    output_file = "epg.xml"
    create_epg_xml(channels, epg_data, output_file)
    print(f"完成!EPG 文件已保存为 {output_file}")
|