import urllib.request
import gzip
import io
from lxml import etree
import json
import jsonpath
# 1. Fetch the page source
# Address of the video page to download
url = 'https://www.bilibili.com/video/BV1Es4y1q7Bf/?spm_id_from=333.788.comment.all.click&vd_source=3f0b4e269bbcd37b2419b130c0a77513'
# Browser-like request headers sent along with the request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Referer': 'https://www.bilibili.com/',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'TE': 'Trailers'
}
# Build a request object carrying the custom headers
request = urllib.request.Request(url=url, headers=headers)
# Send the request. The context manager closes the connection even when
# reading fails, and the timeout keeps the script from hanging forever on
# an unresponsive server.
with urllib.request.urlopen(request, timeout=30) as response:
    content_bytes = response.read()  # raw body bytes exactly as received
    content_encoding = response.headers.get('Content-Encoding', '').lower()
# Wrap the bytes in a file-like object so gzip.GzipFile can read from them
content_io = io.BytesIO(content_bytes)
# No Accept-Encoding header is sent, so the body is not guaranteed to be
# compressed. Decompress only when the server declares gzip or the body
# starts with the gzip magic number; otherwise decode the bytes directly.
is_gzip = 'gzip' in content_encoding or content_bytes[:2] == b'\x1f\x8b'
if is_gzip:
    with gzip.GzipFile(fileobj=content_io, mode='rb') as f:
        content = f.read().decode('utf-8')
else:
    content = content_bytes.decode('utf-8')
# Save a copy of the page source next to the script for inspection
with open('b站视频标题的获取(xpath).html', 'w', encoding='UTF-8') as fp:
    fp.write(content)
# 2. Extract the video titles and durations
# (1) Locate the JSON embedded in the page
# Parse the downloaded HTML into an element tree
tree = etree.HTML(content)
# xpath() returns a list of matches. Check it before indexing so that a page
# layout change produces an explanatory error instead of a bare IndexError.
script_texts = tree.xpath('/html/head[@itemprop="video"]/script[5]/text()')
if not script_texts:
    raise IndexError(
        'no text found at /html/head[@itemprop="video"]/script[5]; '
        'the page layout may have changed'
    )
the_data = script_texts[0]
# The JSON is expected to follow the first '__=' in the script text. Cut only
# at that first occurrence: splitting on every '__=' would truncate JSON that
# happens to contain the same characters inside a value.
prefix, marker, after_marker = the_data.partition('__=')
if not marker:
    raise IndexError("marker '__=' not found in the selected script text")
json_and_trailer = after_marker.lstrip()
# (2) Decode the JSON with the json module
# raw_decode parses exactly one JSON value from the start of the string and
# reports where it ended, so whatever JavaScript follows (such as the
# ';(function' trailer) is ignored without searching for it, and a
# ';(function' sequence inside a JSON string can no longer cut the data short.
video_data, json_end = json.JSONDecoder().raw_decode(json_and_trailer)
the_json_data = json_and_trailer[:json_end]
pages = video_data['videoData']['pages']
# 3. Helper that converts a number of seconds to minutes:seconds
def convert_seconds_to_minutes_seconds_format(seconds: int) -> str:
    """Format a duration given in seconds as ``MM:SS``.

    The minutes field is not wrapped at 60, so a duration of an hour or
    more produces a minutes value above 59 (3725 seconds -> "62:05").

    Args:
        seconds: Duration in whole seconds.

    Returns:
        The duration with minutes and seconds zero-padded to at least
        two digits each.
    """
    minutes, remaining_seconds = divmod(seconds, 60)
    return f"{minutes:02}:{remaining_seconds:02}"
# 4. Print the title and duration of every part and accumulate the total
total_duration_seconds = 0  # running total of all part durations
for page in pages:
    title = page['part']  # title of this part
    duration = page['duration']  # length of this part, treated as seconds
    # Convert the duration to minutes:seconds for display
    formatted_duration = convert_seconds_to_minutes_seconds_format(duration)
    # Print one line per part
    print(f"标题: {title}, 时长: {formatted_duration}")
    # Add this part's length to the running total
    total_duration_seconds += duration
def convert_seconds_to_time_format(seconds: int) -> str:
    """Format a duration given in seconds as ``HH:MM:SS``.

    The hours field is not wrapped, so durations of 100 hours or more
    simply use more than two digits for the hours.

    Args:
        seconds: Duration in whole seconds.

    Returns:
        The duration with hours, minutes and seconds zero-padded to at
        least two digits each.
    """
    # Peel off the seconds first, then split the whole minutes into hours
    # and leftover minutes.
    total_minutes, remaining_seconds = divmod(seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{remaining_seconds:02}"
# 5. Convert the accumulated total to hours:minutes:seconds and report it
total_formatted_duration = convert_seconds_to_time_format(
    total_duration_seconds
)
print(
    f"\n所有视频的总时长: {total_formatted_duration}"
)