抓取PTT網站上的資料
目標位置:
https://www.ptt.cc/bbs/Gossiping/index.html
- 先把html全抓回來
- 紀錄一個cookie是否年滿18歲的變數
- 套件抓取今天日期, 去掉開頭的 '0' 符合 PTT 網站格式
- 取得上一頁href
- 抓取所有文章
- 判斷文章是否是今天發佈
- 取得推文數
- 取得文章連結
- 取得標題
- 取得作者
- 把我們要的資料整理好後,存入一開始的空陣列 後回傳(return)回去
先把html全抓回來
import requests
domain_url = 'https://www.ptt.cc'
def get_ppt_page(url):
    """Download a page and return its HTML text.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response body as text, or None when the request fails or the
        server answers with a status code other than 200.
    """
    try:
        # Without a timeout a stalled server would block the script forever.
        resp = requests.get(url=url, timeout=10)
    except requests.RequestException as err:
        # Network-level failures follow the same "None on failure" contract
        # as a non-200 answer.
        print('Request failed:', url, err)
        return None
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    return resp.text
if __name__ == '__main__':
    # Build the board index URL once and reuse it for the fetch and the log.
    index_url = domain_url + '/bbs/Gossiping/index.html'
    ppt_page = get_ppt_page(index_url)
    if ppt_page:
        print(index_url)
        print("目前有文章")
紀錄一個cookie是否年滿18歲的變數
# Cookie sent with each request to state that the visitor is over 18.
cookies = dict(over18='1')
import requests
domain_url = 'https://www.ptt.cc'
def get_ppt_page(url):
    """Download a page, sending the over-18 cookie, and return its HTML text.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response body as text, or None when the request fails or the
        server answers with a status code other than 200.
    """
    try:
        resp = requests.get(
            url=url,
            cookies={'over18': '1'},  # states that the visitor is over 18
            timeout=10,  # never hang forever on a stalled server
        )
    except requests.RequestException as err:
        # Network-level failures follow the same "None on failure" contract
        # as a non-200 answer.
        print('Request failed:', url, err)
        return None
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    return resp.text
if __name__ == '__main__':
    # Build the board index URL once and reuse it for the fetch and the log.
    index_url = domain_url + '/bbs/Gossiping/index.html'
    ppt_page = get_ppt_page(index_url)
    if ppt_page:
        print(index_url)
        print("目前有文章")
#取得今天日期, 去掉開頭的 '0' 符合 PTT 網站格式
import time
# Today's date as month/day; leading '0' characters are stripped so the
# string matches the date format shown on the PTT list page.
month_day = time.strftime("%m/%d")
today = month_day.lstrip('0')
print(today)
取得上一頁href
data:image/s3,"s3://crabby-images/3321f/3321ff11d7f48bb04f9181c872ffc7a6c84593ca" alt=""
import requests
import time
from bs4 import BeautifulSoup
domain_url = 'https://www.ptt.cc'
def get_ppt_page(url):
    """Download a page, sending the over-18 cookie, and return its HTML text.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response body as text, or None when the request fails or the
        server answers with a status code other than 200.
    """
    try:
        resp = requests.get(
            url=url,
            cookies={'over18': '1'},  # states that the visitor is over 18
            timeout=10,  # never hang forever on a stalled server
        )
    except requests.RequestException as err:
        # Network-level failures follow the same "None on failure" contract
        # as a non-200 answer.
        print('Request failed:', url, err)
        return None
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    return resp.text
def get_pageinfo(resdata):
    """Return the href of the previous-page button found in an index page.

    Args:
        resdata: HTML text of the board index page.

    Returns:
        The 'href' attribute of the second anchor inside the paging toolbar.
    """
    soup = BeautifulSoup(resdata, 'html5lib')
    # The paging toolbar holds the navigation buttons of the board index.
    paging_div = soup.find('div', 'btn-group btn-group-paging')
    print(paging_div)
    # The second anchor of the toolbar is the "previous page" link.
    paging_links = paging_div.find_all('a')
    prev_url = paging_links[1]['href']
    print(prev_url)
    return prev_url
if __name__ == '__main__':
    index_url = domain_url + '/bbs/Gossiping/index.html'
    ppt_page = get_ppt_page(index_url)
    if ppt_page:
        articles = []  # all of today's articles
        # Today's date with leading '0' removed to match PTT's date format.
        month_day = time.strftime("%m/%d")
        today = month_day.lstrip('0')
        prev_href = get_pageinfo(ppt_page)
        prev_page_url = domain_url + prev_href
        print(prev_page_url)
data:image/s3,"s3://crabby-images/3b02c/3b02c5d44aa5dc9f9a43af4e712a09ffe6cbb6e0" alt=""
data:image/s3,"s3://crabby-images/5ef03/5ef0319fb8e3320d0d5be62fa2124b90fe3df392" alt=""
抓取所有文章
import requests
import time
from bs4 import BeautifulSoup
domain_url = 'https://www.ptt.cc'
def get_ppt_page(url):
    """Download a page, sending the over-18 cookie, and return its HTML text.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response body as text, or None when the request fails or the
        server answers with a status code other than 200.
    """
    try:
        resp = requests.get(
            url=url,
            cookies={'over18': '1'},  # states that the visitor is over 18
            timeout=10,  # never hang forever on a stalled server
        )
    except requests.RequestException as err:
        # Network-level failures follow the same "None on failure" contract
        # as a non-200 answer.
        print('Request failed:', url, err)
        return None
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    return resp.text
def get_pageinfo(resdata,today):
    """Locate the previous-page link and dump every article entry.

    Args:
        resdata: HTML text of the board index page.
        today: Today's date string; not used yet at this step.

    Returns:
        A tuple (pptdata, prev_url) where pptdata is still an empty list and
        prev_url is the href of the second anchor in the paging toolbar.
    """
    soup = BeautifulSoup(resdata, 'html5lib')
    # The second anchor of the paging toolbar is the "previous page" link.
    paging_div = soup.find('div', 'btn-group btn-group-paging')
    paging_links = paging_div.find_all('a')
    prev_url = paging_links[1]['href']
    pptdata = []  # collected article data
    # Every article row on the list page is a <div class="r-ent">.
    date_divs = soup.find_all('div', 'r-ent')
    print(date_divs)
    return pptdata, prev_url
if __name__ == '__main__':
    index_url = domain_url + '/bbs/Gossiping/index.html'
    ppt_page = get_ppt_page(index_url)
    if ppt_page:
        articles = []  # all of today's articles
        # Today's date with leading '0' removed to match PTT's date format.
        month_day = time.strftime("%m/%d")
        today = month_day.lstrip('0')
        pptdata, prev_href = get_pageinfo(ppt_page, today)
        prev_page_url = domain_url + prev_href
        print(prev_page_url)
        print(pptdata)
data:image/s3,"s3://crabby-images/95360/953608d4d91b8610e9d580574d8cd3a0299d156d" alt=""
判斷文章是否是今天發佈
for d in date_divs:
    # Each entry carries its posting date in a <div class="date">.
    date_text = d.find('div', 'date').text
    today_data = (date_text.strip() == today)
    print(today_data)
    if today_data:  # only entries posted today are processed further
        print("取得推文數")
如果是今天發佈的,比對結果會是 True,反之則為 False
data:image/s3,"s3://crabby-images/4bcc1/4bcc16489bb6322530df99a056c7d53f0ed088b2" alt=""
def get_pageinfo(resdata,today):
    """Locate the previous-page link and flag the entries posted today.

    Args:
        resdata: HTML text of the board index page.
        today: Date string compared with each entry's date cell.

    Returns:
        A tuple (pptdata, prev_url) where pptdata is still an empty list and
        prev_url is the href of the second anchor in the paging toolbar.
    """
    soup = BeautifulSoup(resdata, 'html5lib')
    # The second anchor of the paging toolbar is the "previous page" link.
    paging_div = soup.find('div', 'btn-group btn-group-paging')
    paging_links = paging_div.find_all('a')
    prev_url = paging_links[1]['href']
    pptdata = []  # collected article data
    # Every article row on the list page is a <div class="r-ent">.
    date_divs = soup.find_all('div', 'r-ent')
    for entry in date_divs:
        date_text = entry.find('div', 'date').text
        today_data = (date_text.strip() == today)
        print(today_data)
        if today_data:  # the entry was posted today
            print("取得推文數")
    return pptdata, prev_url
取得推文數
data:image/s3,"s3://crabby-images/59000/590003fdecdaff6e7d395f905a55185c4fc90397" alt=""
if today_data:  # only handle entries posted today
    # The push count is the text of the entry's <div class="nrec">.
    nrec_div = d.find('div', 'nrec')
    push_count = nrec_div.text
    print(push_count)
data:image/s3,"s3://crabby-images/3d8b7/3d8b7e7b967ca38daeb666d0bf4af74a50d2ea84" alt=""
推文數,可能是'爆'或 'X1', 'X7', ...
data:image/s3,"s3://crabby-images/30e04/30e040b66f88ed58f189c8b226ddea26cf5be190" alt=""
如果是爆,可以給一個特別數字、如果是x開頭的也可以給一個數字
若不是, 不做任何事,push_num 保持為 0
# Convert the push-count text into an integer; blank text stays 0.
push_num = 0
if push_count:
    try:
        push_num = int(push_count)  # plain numeric counts
    except ValueError:
        # Non-numeric markers: '爆' or 'X1', 'X7', ...
        # NOTE(review): values assume PTT shows '爆' for 100+ pushes,
        # 'X<n>' for net boos in steps of ten and 'XX' for -100 or lower
        # -- confirm against the live board.
        if push_count == '爆':
            # Must differ from 0, otherwise a hot post is indistinguishable
            # from a post with no pushes.
            push_num = 100
        elif push_count.startswith('X'):
            level = push_count[1:]
            push_num = -10 * int(level) if level.isdecimal() else -100
        # Anything else: leave push_num at 0.
print(push_num)
data:image/s3,"s3://crabby-images/b64d6/b64d6a827752fa62eb7a7e56bcd16f0e11656146" alt=""
取得文章連結
#判斷超連結是否存在
data:image/s3,"s3://crabby-images/4396c/4396c90d79f5463037501a98cdd0359775e1bbef" alt=""
#取得文章連結
link = d.find('a')
if link:  # a hyperlink is present, so the article exists
    href = link['href']
    print(href)
data:image/s3,"s3://crabby-images/31156/311566a801a361f5fcc1e35f88bed9b036050e5d" alt=""
取得標題
# The text of the entry's anchor is the article title.
title_link = d.find('a')
title = title_link.text
print("標題", title)
取得作者
author_div = d.find('div', 'author')
if author_div:
    author = author_div.text
else:
    author = ''  # the author cell may be missing
print("作者", author)
data:image/s3,"s3://crabby-images/127a4/127a49e5caa1ab2b87d08b566c4e5fa352ba7c07" alt=""
link = d.find('a')
if link:  # a hyperlink is present, so the article exists
    href = link['href']
    print("標題連結", href)
    title = link.text
    print("標題", title)
    # The author cell may be missing; fall back to an empty string.
    author_div = d.find('div', 'author')
    author = author_div.text if author_div else ''
    print("作者", author)
把我們要的資料整理好後,存入一開始的空陣列 後回傳(return)回去
"標題"、"標題連結"、"作者"、"推文數"
# Store the collected fields of this article in the result list.
pptdata.append(dict(
    title=title,
    href=href,
    push_num=push_num,
    author=author,
))
data:image/s3,"s3://crabby-images/f1d65/f1d65a154e3dcee5096a443d81e6149c969c7a24" alt=""
完整Code:
import requests
import time
from bs4 import BeautifulSoup
domain_url = 'https://www.ptt.cc'
def get_ppt_page(url):
    """Download a page, sending the over-18 cookie, and return its HTML text.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response body as text, or None when the request fails or the
        server answers with a status code other than 200.
    """
    try:
        resp = requests.get(
            url=url,
            cookies={'over18': '1'},  # states that the visitor is over 18
            timeout=10,  # never hang forever on a stalled server
        )
    except requests.RequestException as err:
        # Network-level failures follow the same "None on failure" contract
        # as a non-200 answer.
        print('Request failed:', url, err)
        return None
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    return resp.text
def _parse_push_count(push_count):
    """Convert the text of a push-count cell into an integer.

    Plain numbers are returned as integers and blank or unrecognised text
    yields 0.

    NOTE(review): the non-numeric markers are mapped on the assumption that
    PTT shows '爆' for 100 or more pushes, 'X1'..'X9' for net boos in steps
    of ten and 'XX' for -100 or lower -- confirm against the live board.
    """
    push_count = push_count.strip()
    if not push_count:
        return 0
    try:
        return int(push_count)
    except ValueError:
        pass
    if push_count == '爆':
        # Must differ from 0, otherwise a hot post is indistinguishable from
        # a post with no pushes.
        return 100
    if push_count.startswith('X'):
        level = push_count[1:]
        return -10 * int(level) if level.isdecimal() else -100
    return 0


def get_pageinfo(resdata,today):
    """Extract today's articles and the previous-page link from an index page.

    Args:
        resdata: HTML text of the board index page.
        today: Date string compared with each entry's date cell; the caller
            builds it as month/day with leading '0' stripped (e.g. '3/07').

    Returns:
        A tuple (pptdata, prev_url). pptdata is a list of dicts with the keys
        'title', 'href', 'push_num' and 'author', one per entry that is dated
        today and has a hyperlink. prev_url is the href of the second anchor
        in the paging toolbar.
    """
    soup = BeautifulSoup(resdata, 'html5lib')

    # The second anchor of the paging toolbar is the "previous page" link.
    paging_div = soup.find('div', 'btn-group btn-group-paging')
    prev_url = paging_div.find_all('a')[1]['href']

    pptdata = []  # collected article data
    # Every article row on the list page is a <div class="r-ent">.
    for d in soup.find_all('div', 'r-ent'):
        date_div = d.find('div', 'date')
        if date_div is None or date_div.text.strip() != today:
            continue  # not posted today

        # Push count
        nrec_div = d.find('div', 'nrec')
        push_count = nrec_div.text if nrec_div else ''
        print(push_count)
        push_num = _parse_push_count(push_count)
        print("推文數", push_num)

        # An entry without a hyperlink has no article to point at. Skipping
        # it avoids storing title/href values that were never set or that
        # belong to the previous entry.
        link = d.find('a')
        if link is None:
            continue
        href = link['href']
        print("標題連結", href)
        title = link.text
        print("標題", title)

        # The author cell may be missing; fall back to an empty string.
        author_div = d.find('div', 'author')
        author = author_div.text if author_div else ''
        print("作者", author)

        pptdata.append({
            'title': title,
            'href': href,
            'push_num': push_num,
            'author': author,
        })
    return pptdata, prev_url
if __name__ == '__main__':
    index_url = domain_url + '/bbs/Gossiping/index.html'
    ppt_page = get_ppt_page(index_url)
    if ppt_page:
        articles = []  # all of today's articles
        # Today's date with leading '0' removed to match PTT's date format.
        month_day = time.strftime("%m/%d")
        today = month_day.lstrip('0')
        pptdata, prev_href = get_pageinfo(ppt_page, today)
        prev_page_url = domain_url + prev_href
        print(prev_page_url)
        print(pptdata)
Yiru@Studio - 關於我 - 意如