Catalog
  1. 1. 豆瓣电影助手
    1. 1.1. 一.实现思路
豆瓣电影助手

豆瓣电影助手

  • 为用户爬取豆瓣电影详细信息

  • 列出相关资源地址,比如观影地址和网盘并实现跳转

  • 数据存储

一.实现思路

  • 排行榜爬取

    ​ 首先Chorme F12,按照图示依次点击,逐渐下滑页面可以发现是动态刷新的,并且点击preview有相应的json数据。

    ​ 由此我们便可得出爬取页面的数据,相应得出代码中的url:

    url = 'https://movie.douban.com/j/chart/top_list?type=' + str(
    typeId) + '&interval_id=100:90&action=unwatched&start=0&limit=' + str(movie_count)
  • 关键字爬取

    这次操作可不能像排行榜那样进行爬取了,因为关键字搜索页面没有jsondata,所以我们只能用最普通的方式进行爬取,也就是分析网页标签的方式进行爬取。

诸如其上的 “href” “rating_nums” “meta abstract”等。

​ 数据爬取之后,接下来就是一些数据的存储和转化,此处不再赘述。

二.效果展示

​ 主界面

  • ​ 排行榜查询

    可以添加类型,数量,评分,评价人数等限定

  • 关键字查询

    直接查询影片名称,并返回和其相似的电影

我爬取的数据来源于豆瓣官方,所以我能爬取到的排行榜最大数据等同于官网数据:

爬取数据

豆瓣官网

可以看到本地数据和官网数据针对”剧情“片,均为686条。

  • 云盘和在线观看

    双击可直接跳转到此电影对应的网盘和观影页面。

  • 无gui版

  • 数据存储

此处,附上无gui版本代码

(若想运行此代码,需要安装相应库和Chrome dirver)

# -*- coding:utf-8 -*-
from ssl import _create_unverified_context
from json import loads
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import tkinter.messagebox

import urllib.request
import urllib.parse

movieData = ' [' \
'{"title":"纪录片", "type":"1", "interval_id":"100:90"}, ' \
' {"title":"传记", "type":"2", "interval_id":"100:90"}, ' \
' {"title":"犯罪", "type":"3", "interval_id":"100:90"}, ' \
' {"title":"历史", "type":"4", "interval_id":"100:90"}, ' \
' {"title":"动作", "type":"5", "interval_id":"100:90"}, ' \
' {"title":"情色", "type":"6", "interval_id":"100:90"}, ' \
' {"title":"歌舞", "type":"7", "interval_id":"100:90"}, ' \
' {"title":"儿童", "type":"8", "interval_id":"100:90"}, ' \
' {"title":"悬疑", "type":"10", "interval_id":"100:90"}, ' \
' {"title":"剧情", "type":"11", "interval_id":"100:90"}, ' \
' {"title":"灾难", "type":"12", "interval_id":"100:90"}, ' \
' {"title":"爱情", "type":"13", "interval_id":"100:90"}, ' \
' {"title":"音乐", "type":"14", "interval_id":"100:90"}, ' \
' {"title":"冒险", "type":"15", "interval_id":"100:90"}, ' \
' {"title":"奇幻", "type":"16", "interval_id":"100:90"}, ' \
' {"title":"科幻", "type":"17", "interval_id":"100:90"}, ' \
' {"title":"运动", "type":"18", "interval_id":"100:90"}, ' \
' {"title":"惊悚", "type":"19", "interval_id":"100:90"}, ' \
' {"title":"恐怖", "type":"20", "interval_id":"100:90"}, ' \
' {"title":"战争", "type":"22", "interval_id":"100:90"}, ' \
' {"title":"短片", "type":"23", "interval_id":"100:90"}, ' \
' {"title":"喜剧", "type":"24", "interval_id":"100:90"}, ' \
' {"title":"动画", "type":"25", "interval_id":"100:90"}, ' \
' {"title":"同性", "type":"26", "interval_id":"100:90"}, ' \
' {"title":"西部", "type":"27", "interval_id":"100:90"}, ' \
' {"title":"家庭", "type":"28", "interval_id":"100:90"}, ' \
' {"title":"武侠", "type":"29", "interval_id":"100:90"}, ' \
' {"title":"古装", "type":"30", "interval_id":"100:90"}, ' \
' {"title":"黑色电影", "type":"31", "interval_id":"100:90"}' \
']'


class getMovieInRankingList:


def __init__(self):
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument(
'user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"') # 设置user=agent
chrome_options.add_experimental_option('excludeSwitches',['enable-automation'])
chrome_options.add_experimental_option("prefs",{"profile.managed_default_content_settings.images": 2})

try:
self.browser = webdriver.Chrome(executable_path='C:\chromedriver_win32\chromedriver.exe',options=chrome_options)
self.wait = WebDriverWait(self.browser, 10)
except:
print("chromedriver.exe出错,请检查是否与你的chrome浏览器版本相匹配\n缺失chromedriver.exe不会导致从排行榜搜索功能失效,但会导致从关键字搜索功能失效")


def get_url_data_in_ranking_list(self, typeId,title, movie_count, rating, vote_count):
context = _create_unverified_context()
headers = {#伪装UA绕过检测
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36', }
url = 'https://movie.douban.com/j/chart/top_list?type=' + str(
typeId) + '&interval_id=100:90&action=unwatched&start=0&limit=' + str(movie_count) #此url可经F12分析出

req = urllib.request.Request(url=url, headers=headers)
f = urllib.request.urlopen(req, context=context)
response = f.read()
jsonData = loads(response)

list = []
for subData in jsonData:
if ((float(subData['rating'][0]) >= float(rating))and(float(subData['vote_count'])>=float(vote_count))):
subList = []
subList.append(subData['title'])
subList.append(subData['rating'][0])
subList.append(subData['rank'])
subList.append(subData['vote_count'])
list.append(subList)
for data in list:
print(data)

f = open("./" + title+ "_"+rating+"_"+vote_count+".txt", "w", encoding="utf-8")
sample_list = [str(line) + '\n' for line in list]
f.writelines(sample_list)
f.close()
print('共爬取到' + str(list.__len__()) + '条数据')
return list, jsonData


# 从关键字获取电影数据
def get_url_data_in_keyWord(self, key_word):
# 浏览网页
self.browser.get('https://movie.douban.com/subject_search?search_text=' + urllib.parse.quote(
key_word) + '&cat=1002') # get方式获取返回数据


self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.root')))

dr = self.browser.find_elements_by_xpath("//div[@class='item-root']") # 获取class为item-root的DIV(因为有多个结果)
jsonData = []
list = []
for son in dr:
movieData = {'rating': ['', 'null'], 'cover_url': '', 'types': '', 'title': '', 'url': '',
'release_date': '', 'vote_count': '', 'actors': ''}
subList = ['', '', '', '']

url_element = son.find_elements_by_xpath(".//a") # 获取第一个a标签的url(因为有多个结果)
if (url_element):
movieData['url'] = (url_element[0].get_attribute("href"))

img_url_element = url_element[0].find_elements_by_xpath(".//img") # 获取影片海报图片地址
if (img_url_element):
movieData['cover_url'] = (img_url_element[0].get_attribute("src"))

title_element = son.find_elements_by_xpath(".//div[@class='title']") # 获取标题
if (title_element):
temp_title = (title_element[0].text)
movieData['title'] = (temp_title.split('('))[0]
movieData['release_date'] = temp_title[temp_title.find('(') + 1:temp_title.find(')')]
subList[0] = movieData['title']

rating_element = son.find_elements_by_xpath(".//span[@class='rating_nums']") # 获取评分
if (rating_element):
movieData['rating'][0] = (rating_element[0].text)
subList[1] = movieData['rating'][0]

vote_element = son.find_elements_by_xpath(".//span[@class='pl']") # 获取评价数
if (vote_element):
movieData['vote_count'] = (vote_element[0].text).replace('(', '').replace(')', '').replace('人评价', '')
subList[3] = movieData['vote_count']

type_element = son.find_elements_by_xpath(".//div[@class='meta abstract']") # 获取类型
if (type_element):
movieData['types'] = (type_element[0].text)
subList[2] = movieData['types']

actors_element = son.find_elements_by_xpath(".//div[@class='meta abstract_2']") # 获取演员
if (actors_element):
movieData['actors'] = (actors_element[0].text)

jsonData.append(movieData)
list.append(subList)
self.browser.quit()
for data in list:
print(data)
print('共爬取到'+str(list.__len__())+'条数据')
f = open("./" +key_word+".txt", "w",encoding="utf-8")
sample_list = [str(line)+'\n' for line in list]
f.writelines(sample_list)
f.close()
return list, jsonData

def BinarySearch(strs,filename):
f = open(filename,"r",encoding='utf-8')
line = f.readline()
cnt = 0
dic = {}
list = []
while line:
line = line[2:line.find("'",2)]
line = line.rstrip()
dic[line] = cnt
cnt = cnt+1
line = f.readline()
low = 0
high = cnt-1
flag = 0
for key in dic.keys():
if key == strs:
flag = 1
break
if flag == 0:
print("不存在这部电影")
return 0

while low <= high:
mid = (low+high)/2
if dic[strs] < mid:
high = mid - 1
elif dic[strs] > mid:
low = mid +1
else:
print('找到电影,其在对应文件的行号为' ,int(mid+1))
return 0

if __name__ == "__main__":
print('**************1.排行榜爬取***************')
print('**************2.电影名称爬取**************')
print('**************3.退出********************')
choice = input('请选择')

while choice != '3':
if(choice == '2'):
print('蜘蛛侠即将到达战场,请稍等···')
movie = getMovieInRankingList()
moviename = input("请输入要爬取的电影的名称:")
print('数据爬取并写入文件,请等待···')
movie.get_url_data_in_keyWord(moviename)
strs = input('请输入搜索的电影名称(二分查找):')
filename = moviename+'.txt'
res = BinarySearch(strs,filename)
print('------------------------------------------------')
elif(choice == '1'): #若运行choice == 1此部分请参照代码顶部的movieData表
#typeId, title, movie_count, rating, vote_count
print('蜘蛛侠即将到达战场,请稍等···')
movie = getMovieInRankingList()
typeId = input('请输入类型:')
title = input('请输入电影类别:')
movie_count = input('请输入爬取数量:')
rating = input('请输入评分:')
vote_count = input('请输入评论数:')
movie.get_url_data_in_ranking_list(typeId,title,movie_count,rating,vote_count)
print('数据爬取并写入文件,请等待···')
print('------------------------------------------------')

choice = input('请选择')

gui版本代码大概有600 多行 ,太长了,此处不予公示示(手动狗头)

这个tkinter 没有我的qt好使(再次狗头)

(tkinter 对于gui要求不高的还可,优点是效率较pyqt和wxpython高,缺点是一个字

Author: superzhaoyang
Link: http://yoursite.com/2020/03/31/DoubanMovie/
Copyright Notice: All articles in this blog are licensed under CC BY-NC-SA 4.0 unless stating additionally.
Donate
  • 微信
  • 支付宝

Comment