目標網站:https://movies.yahoo.com.tw/movie_thisweek.html
(完整程式碼)
#目標位置>>Yahoo>>電影>>Yahoo本週新片
#https://tw.movies.yahoo.com/movie_thisweek.html
import requests
import re
from bs4 import BeautifulSoup
# Yahoo Movies: address of the "new releases this week" page that this script scrapes.
# NOTE(review): the header line at the top of this file lists
# https://movies.yahoo.com.tw/movie_thisweek.html instead - confirm which host is current.
yahoo_movie_url = 'https://tw.movies.yahoo.com/movie_thisweek.html' # target page
def check_req_url(url, timeout=10):
    """Fetch *url* with HTTP GET and return its body text, or ``"fail"``.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The response text when the status code is 200; otherwise the string
        ``"fail"``. Note that ``"fail"`` is a non-empty (truthy) string, so
        callers must compare against it explicitly instead of relying on
        truthiness.
    """
    try:
        resp = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        # Connection errors, timeouts, malformed URLs, etc. are reported the
        # same way as a non-200 status instead of crashing the caller.
        print('Request failed:', url, err)
        return "fail"

    if resp.status_code != 200:
        # e.g. 404 when the page does not exist; 200 means success.
        print('Invalid url:', resp.url)
        return "fail"
    return resp.text
def get_week_new_movies(webpage):
    """Extract one record per movie from the weekly new-releases HTML.

    Args:
        webpage: HTML text of the page to parse.

    Returns:
        A list of dicts, one per ``<div class="release_info_text">`` found,
        each with the keys ``ch_name``, ``english_name``, ``info``,
        ``expectation``, ``release_date``, ``poster_url``, ``movie_id`` and
        ``trailer_url``. ``trailer_url`` is ``''`` when no trailer link with
        an ``href`` can be found.
    """
    soup = BeautifulSoup(webpage, 'html5lib')
    movies = []

    # Each movie's textual details live inside <div class="release_info_text">.
    for row in soup.find_all('div', 'release_info_text'):
        # Look the name container up once; it holds the Chinese title link,
        # the English title and the link from which the movie id is derived.
        name_div = row.find('div', 'release_movie_name')
        title_link = name_div.a

        data_movie = {
            # Chinese title
            'ch_name': title_link.text.strip(),
            # English title
            'english_name': name_div.find('div', 'en').a.text.strip(),
            # Synopsis
            'info': row.find('div', 'release_text').text.strip(),
            # Audience expectation figure
            'expectation': row.find('div', 'leveltext').span.text.strip(),
            # Keep only the date part, e.g. "上映日期:2020-11-20" -> "2020-11-20"
            'release_date': get_date(row.find('div', 'release_movie_time').text),
            # The poster sits one level up, in the sibling that precedes the
            # row's parent element.
            'poster_url': row.parent.find_previous_sibling('div', 'release_foto').a.img['src'],
            # Movie id taken from the title link's href
            'movie_id': get_movie_id(title_link['href']),
        }

        # The trailer is the second link inside the button box that follows
        # the row. Fall back to '' when the box, the link or its href is
        # missing, so one incomplete entry does not abort the whole scrape.
        btn_box = row.find_next_sibling('div', 'release_btn color_btnbox')
        links = btn_box.find_all('a') if btn_box is not None else []
        data_movie['trailer_url'] = links[1].get('href', '') if len(links) > 1 else ''

        # A fresh dict is built on every iteration, so each appended record
        # is independent of the next one.
        movies.append(data_movie)

    return movies
def get_movie_id(url):
    """Return the part of *url* after its last ``'-'``.

    Args:
        url: Link whose trailing ``-<id>`` segment is wanted.

    Returns:
        The text following the last hyphen, or the whole string when it
        contains no hyphen. If *url* cannot be split with a ``str`` separator
        (for example ``None`` or ``bytes``) it is returned unchanged.
    """
    try:
        return url.split('-')[-1]
    except (AttributeError, TypeError):
        # Only a non-str input can fail here; a bare ``except`` would also
        # have swallowed KeyboardInterrupt and SystemExit.
        return url
def get_date(date_str):
    """Return the first ``digits-digits-digits`` date found in *date_str*.

    Example: ``"上映日期:2020-11-20"`` -> ``"2020-11-20"``.

    Args:
        date_str: Text that may contain a hyphen-separated date.

    Returns:
        The matched date substring, or *date_str* unchanged when it contains
        no such pattern.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    # \d+ matches a run of one or more digits; the runs are joined by '-'.
    match = re.search(r'\d+-\d+-\d+', date_str)
    if match is None:
        return date_str
    return match.group(0)
if __name__ == '__main__':
    # Script entry point: download the weekly new-releases page and parse it.
    webpage = check_req_url(yahoo_movie_url)
    # check_req_url reports failure with the non-empty string "fail", which is
    # truthy; compare against it explicitly so a failed request is not fed to
    # the HTML parser.
    if webpage and webpage != "fail":
        movies = get_week_new_movies(webpage)
        # print(movies)
Yiru@Studio - 關於我 - 意如