kuwo批量获取所有歌手信息过程记录

效果(主要使用 Charles 和 Python 实现)

批量获取kuwo的所有歌手数据,共169页
歌手列表

保存的数据

操作步骤

1、使用抓包工具Charles 的mirror功能

工具:Charles 的mirror

点击菜单栏Tools,弹出菜单选择Mirror,点开mirror窗口后,设置保存路径和url

image.png

image.png

2、然后打开浏览器访问kuwo歌手的列表页,就可以在目录中看到下载文件了

image.png

3、rename.py对下载的文件进行批量重命名

注意 rename.py 与下载的歌手数据之间的目录结构:rename.py 应与存放歌手数据文件的目录放在同一级目录下,这里存放数据的目录是 artist_11

image.png

rename.py 代码


import os
import re
import urllib.parse
import sys


# 下载后的歌手文件,批量重命名
#例如:
#原名字:artistInfo%3fcategory%3d0%26prefix%3d%26pn%3d1%26rn%3d60%26httpsStatus%3d1%26reqId%3d3b128760-1ce3-11f1-b0ff-171b6be4e54a%26plat%3dweb_www%26from%3d
#新名字:pn1.json
def rename_files(directory='.'):
    """Rename mirrored artist-list files in ``directory`` to ``pn<page>.json``.

    The page number is read from the ``pn`` query parameter embedded in the
    file name: first in its percent-encoded form (``pn%3d12``) and, if that
    is not found, after URL-decoding the name (``pn=12``). Entries that are
    not regular files, or whose name carries no page number, are left alone.

    If the target name is already taken, a numeric suffix is appended to the
    base name (``pn1_1.json``, ``pn1_2.json``, ...) so no file is overwritten.

    Args:
        directory: Folder whose files are renamed in place.
    """
    # Sorted so that which duplicate receives which suffix is reproducible.
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            continue

        # Try the still-encoded form first, then the decoded form.
        match = re.search(r'pn%3d(\d+)', filename, re.IGNORECASE)
        if not match:
            decoded = urllib.parse.unquote(filename)
            match = re.search(r'pn=(\d+)', decoded, re.IGNORECASE)
        if not match:
            continue

        base = f'pn{match.group(1)}'
        new_name = f'{base}.json'
        new_path = os.path.join(directory, new_name)

        # Always build the candidate from the fixed base; re-splitting the
        # previous candidate would stack suffixes (pn1_1_2.json).
        counter = 1
        while os.path.exists(new_path):
            new_name = f'{base}_{counter}.json'
            new_path = os.path.join(directory, new_name)
            counter += 1

        os.rename(filepath, new_path)
        print(f'Renamed: {filename} -> {new_name}')

if __name__ == '__main__':
    # The directory to process is the first command-line argument when one
    # is given; otherwise the default ./artist_11 is used.
    if len(sys.argv) > 1:
        target_dir = sys.argv[1]
    else:
        target_dir = './artist_11'
    rename_files(target_dir)

使用 Python 自动控制浏览器,模拟点击“下一页”按钮逐页翻页,这里只翻到第169页

from selenium import webdriver  
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
import sys
import json
import re

from selenium.webdriver.common.keys import Keys

import undetected_chromedriver as uc
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd

import select
import os
import hashlib

def openMiguHtml(miguUrl):
    """Start Chrome through undetected_chromedriver and load ``miguUrl``.

    The chromedriver and Chrome binaries are taken from fixed paths under
    ``./res``; the browser runs with a user-data directory under
    ``./res/profile`` that is created when missing.

    Args:
        miguUrl: Address to open once the browser is up.

    Returns:
        The driver object created by ``uc.Chrome``.
    """
    profile_dir = './res/profile/Default'
    os.makedirs(profile_dir, exist_ok=True)

    opts = Options()
    opts.add_argument(f"--user-data-dir={profile_dir}")

    browser = uc.Chrome(
        options=opts,
        driver_executable_path='./res/chrom_driver/chromedriver-mac-x64/chromedriver',
        browser_executable_path="./res/chrom/chrome-mac-x64/chrometest.app/Contents/MacOS/GoogleChromeTesting",
    )
    browser.get(miguUrl)
    return browser

def scrollToElement(driver, element):
    """Ask the browser to scroll ``element`` into view via JavaScript."""
    scroll_js = "arguments[0].scrollIntoView();"
    driver.execute_script(scroll_js, element)


# 模拟点击元素
def clickElement(driver, element):
    """Scroll ``element`` into view, pause one second, then mouse-click it.

    The click is performed with an ``ActionChains`` sequence: move the
    pointer onto the element, then click.

    Args:
        driver: WebDriver instance controlling the page.
        element: Element to click.
    """
    scrollToElement(driver, element)
    # Fixed one-second pause between scrolling and clicking.
    time.sleep(1)
    # The previous window.screenX/screenY query was dropped: its result was
    # never used and it only cost an extra round-trip to the browser.
    ActionChains(driver).move_to_element(element).click().perform()

#等待用户确认函数
def wait_for_enter(prompt="按回车键继续..."):
    """Block until the user presses Enter, then drop any extra typed input.

    End-of-file on stdin (for example when no console is attached) is
    treated as if Enter had been pressed.

    Args:
        prompt: Text shown while waiting.
    """
    try:
        input(prompt)
    except EOFError:  # no console attached / stdin closed
        pass

    # Flushing a read stream does not discard unread input, so clear the
    # terminal's input queue instead. This is best-effort: it only applies to
    # an interactive terminal on platforms that provide termios.
    try:
        import termios
    except ImportError:
        return

    stdin = sys.stdin
    try:
        if stdin is not None and stdin.isatty():
            termios.tcflush(stdin.fileno(), termios.TCIFLUSH)
    except (OSError, ValueError, termios.error):
        # Nothing useful can be done if the terminal refuses the request.
        pass

def wait_element_appeared(driver, xpath_text, timeout=60):
    """Poll once per second until an element matching ``xpath_text`` is found.

    Each attempt sleeps one second and then calls ``driver.find_element``.
    A failed attempt is reported on stdout and polling continues, whatever
    the exception was.

    Args:
        driver: WebDriver instance to search in.
        xpath_text: XPath expression identifying the element.
        timeout: Maximum number of one-second attempts (default 60).

    Returns:
        The located element, or ``None`` if every attempt failed.
    """
    print(f"\n{xpath_text}\n")
    for attempt in range(1, timeout + 1):
        time.sleep(1)
        try:
            return driver.find_element(By.XPATH, xpath_text)
        except Exception:
            # Deliberately broad: any lookup failure means "not there yet".
            print(f"等待标签出现{attempt}秒")
    print(f"{timeout}秒结束等待")
    return None


import time
import random

def click_next_page_in_loop(driver, total_pages=169):
    """Click the "next page" button until page ``total_pages`` is reached.

    Starting from page 1, each round waits for the next-page button to be
    present, sleeps a random 5-10 seconds, and then clicks it. The loop stops
    early, with a message on stdout, if the button cannot be found or any
    step raises.

    Args:
        driver: selenium WebDriver instance showing the first page.
        total_pages: Number of the last page to navigate to (default 169).
    """
    next_page_xpath = "//i[@class='li-page iconfont icon-icon_pagedown']"

    for page in range(2, total_pages + 1):
        try:
            print(f"开始点击第 {page} 页")
            next_ele = wait_element_appeared(driver, next_page_xpath)
            if next_ele is None:
                # The wait helper gave up; report that directly rather than
                # failing later inside the click helper.
                print("循环终止:未找到下一页按钮")
                return

            # Random pause of 5-10 seconds before each click.
            wait_time = random.randint(5, 10)
            print(f"等待 {wait_time} 秒后继续...")
            time.sleep(wait_time)

            clickElement(driver, next_ele)
        except Exception as e:
            # Any failure (lookup, click, browser gone) ends the paging run.
            print(f"循环终止:{str(e)}")
            return

    print("循环结束")

# Script entry: open the kuwo singer list page, then page through it.
singers_url = 'https://www.kuwo.cn/singers'
driver = openMiguHtml(singers_url)
click_next_page_in_loop(driver)
微信打赏

意外收获认可,内心充满感激;打赏之举,更添无限动力。