Skip to main content
  1. Posts/

Crawl the duration and title of Bilibili videos

·220 words·2 mins·
Other Spider
misdazzling
Author
misdazzling
Infinite progress
import urllib.request
import gzip
import io
from lxml import etree
import json
import jsonpath

# 1. Fetch the page source
# Address of the video page to request
url = 'https://www.bilibili.com/video/BV1Es4y1q7Bf/?spm_id_from=333.788.comment.all.click&vd_source=3f0b4e269bbcd37b2419b130c0a77513'
# Browser-like request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Referer': 'https://www.bilibili.com/',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'TE': 'Trailers'
}
# Build the request object carrying the custom headers
request = urllib.request.Request(url=url, headers=headers)
# Send the request. The context manager closes the connection once the body
# has been read, and the timeout keeps the script from hanging forever on a
# stalled connection.
with urllib.request.urlopen(request, timeout=30) as response:
    content_bytes = response.read()  # raw response body as bytes

# The body is not guaranteed to be gzip-compressed (no Accept-Encoding header
# is sent), so detect gzip by its two leading magic bytes instead of
# decompressing unconditionally.
if content_bytes[:2] == b'\x1f\x8b':
    page_bytes = gzip.decompress(content_bytes)
else:
    page_bytes = content_bytes
content = page_bytes.decode('utf-8')

# Save the page source to a local HTML file for inspection
with open('b站视频标题的获取xpath.html', 'w', encoding='UTF-8') as fp:
    fp.write(content)

# 2. Extract the embedded video data
# 2.1 Locate the script text that carries the JSON payload
# Parse the HTML returned by the server into an element tree
tree = etree.HTML(content)
# tree.xpath returns a list; check it before indexing so a page-layout change
# produces a clear error instead of a bare IndexError.
script_texts = tree.xpath('/html/head[@itemprop="video"]/script[5]/text()')
if not script_texts:
    raise RuntimeError('embedded data <script> not found; the page layout may have changed')
the_data = script_texts[0]

# The JSON document follows the first "__=" marker in the script text.
_prefix, marker, after_marker = the_data.partition('__=')
if not marker:
    raise RuntimeError('"__=" marker not found in the embedded data <script>')

# 2.2 Decode the JSON payload
# raw_decode parses exactly one JSON document from the start of the string and
# reports where it ended, so the JavaScript that trails the JSON is ignored
# without having to know what it looks like.
json_and_tail = after_marker.lstrip()
video_data, json_end = json.JSONDecoder().raw_decode(json_and_tail)
the_json_data = json_and_tail[:json_end]
pages = video_data['videoData']['pages']


# 3. Helper that renders a number of seconds as "minutes:seconds"
def convert_seconds_to_minutes_seconds_format(seconds):
    """Return *seconds* formatted as zero-padded ``MM:SS``.

    Minutes are not wrapped at 60, so durations of an hour or more show a
    minute count of 60 or above (e.g. 3725 -> "62:05").
    """
    whole_minutes, leftover_seconds = divmod(seconds, 60)
    return f"{whole_minutes:02}:{leftover_seconds:02}"


# 4. Print the title and duration of every part, then total the durations
for part_info in pages:
    part_title = part_info['part']  # title of this part
    part_seconds = part_info['duration']  # length of this part, in seconds

    # Render the length as minutes:seconds for display
    part_length_text = convert_seconds_to_minutes_seconds_format(part_seconds)
    print(f"标题: {part_title}, 时长: {part_length_text}")

# Running total of all part lengths, in seconds
total_duration_seconds = sum(part_info['duration'] for part_info in pages)

def convert_seconds_to_time_format(seconds):
    """Return *seconds* formatted as zero-padded ``HH:MM:SS``.

    Hours are not wrapped, so totals of 100 hours or more simply use more
    than two digits for the hour field.
    """
    # Split off whole hours first; what is left is less than one hour.
    whole_hours, within_hour = divmod(seconds, 3600)
    whole_minutes = within_hour // 60
    leftover_seconds = seconds % 60
    return f"{whole_hours:02}:{whole_minutes:02}:{leftover_seconds:02}"


# 5. Render the accumulated total as hours:minutes:seconds and print it
total_formatted_duration = convert_seconds_to_time_format(
    seconds=total_duration_seconds,
)
print(f"\n所有视频的总时长: {total_formatted_duration}")