跳过正文
  1. Posts/

爬取哔哩哔哩视频时长和标题

·222 字·2 分钟·
杂项 爬虫
misdazzling
作者
misdazzling
无限进步

代码
#

import urllib.request
import gzip
import io
from lxml import etree
import json
import jsonpath

# 1. Fetch the page source
# Address of the video page to scrape
url = 'https://www.bilibili.com/video/BV1Es4y1q7Bf/?spm_id_from=333.788.comment.all.click&vd_source=3f0b4e269bbcd37b2419b130c0a77513'
# Browser-like request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Referer': 'https://www.bilibili.com/',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'TE': 'Trailers'
}
# Build a request object carrying the custom headers
request = urllib.request.Request(url=url, headers=headers)
# Send the request. The context manager closes the connection once the body
# has been read, and the timeout keeps the script from hanging indefinitely.
with urllib.request.urlopen(request, timeout=30) as response:
    content_bytes = response.read()  # raw response body as bytes

# The headers above never ask for compression, so the body is not guaranteed
# to be gzip data. Decompress only when it starts with the gzip magic number;
# otherwise decode the bytes directly instead of failing inside GzipFile.
content_io = io.BytesIO(content_bytes)  # file-like wrapper required by GzipFile
if content_bytes[:2] == b'\x1f\x8b':
    with gzip.GzipFile(fileobj=content_io, mode='rb') as f:
        content = f.read().decode('utf-8')
else:
    content = content_bytes.decode('utf-8')

# Keep a copy of the page source on disk for inspection
with open('b站视频标题的获取xpath.html', 'w', encoding='UTF-8') as fp:
    fp.write(content)

# 2. Extract the video titles and durations
# 2.1 Locate the inline script that carries the JSON payload
# Parse the HTML returned by the server
tree = etree.HTML(content)
# The payload sits between these two markers inside the script text
payload_start_marker = '__='
payload_end_marker = ';(function'
# tree.xpath returns a list of matching text nodes (possibly empty)
script_candidates = tree.xpath('/html/head[@itemprop="video"]/script[5]/text()')
if not script_candidates or payload_start_marker not in script_candidates[0]:
    # The positional XPath breaks as soon as the page layout shifts, so fall
    # back to scanning every script for one that contains both markers.
    script_candidates = [
        script_text for script_text in tree.xpath('//script/text()')
        if payload_start_marker in script_text and payload_end_marker in script_text
    ]
if not script_candidates:
    raise RuntimeError('could not find the inline script holding the video JSON data')
the_data = script_candidates[0]
# maxsplit=1 keeps the whole payload even if the start marker shows up again
# somewhere inside the JSON text.
the_json_data = the_data.split(payload_start_marker, 1)[1].split(payload_end_marker, 1)[0]

# 2.2 Parse the JSON text with the standard json module and pick out the
# 'pages' entry of 'videoData'
video_data = json.loads(the_json_data)
pages = video_data['videoData']['pages']


# 3. 定义一个转换秒数为分:秒格式的函数
def convert_seconds_to_minutes_seconds_format(seconds):
    """Format a duration given in seconds as a ``MM:SS`` string.

    Both fields are zero-padded to at least two digits. Minutes are not
    capped at 59, so a duration of an hour or more produces a larger minutes
    field (e.g. 3725 -> "62:05").
    """
    whole_minutes, leftover_seconds = divmod(seconds, 60)
    return f"{whole_minutes:02}:{leftover_seconds:02}"


# 4. Print the title and duration of every part, then total up the durations
for page in pages:
    # 'part' holds the title, 'duration' the length in seconds
    title, duration = page['part'], page['duration']
    formatted_duration = convert_seconds_to_minutes_seconds_format(duration)
    print(f"标题: {title}, 时长: {formatted_duration}")

# Running total of all part durations, in seconds
total_duration_seconds = sum(page['duration'] for page in pages)

def convert_seconds_to_time_format(seconds):
    """Format a duration given in seconds as an ``HH:MM:SS`` string.

    Each field is zero-padded to at least two digits. The hours field is not
    capped, so it grows as needed for long durations.
    """
    # Split off whole hours first; what is left lies within a single hour.
    hours, within_hour = divmod(seconds, 3600)
    minutes = within_hour // 60
    leftover_seconds = seconds % 60
    return f"{hours:02}:{minutes:02}:{leftover_seconds:02}"


# 5. Convert the accumulated total to hours:minutes:seconds and report it
total_formatted_duration = convert_seconds_to_time_format(
    seconds=total_duration_seconds,
)
print(f"\n所有视频的总时长: {total_formatted_duration}")