完整教學

環境設定

安裝 Python 的 requests、openpyxl 套件(pprint 與 time 屬於標準函式庫,不需另外安裝)

1
2
3
4
import requests 
import pprint
import time
import openpyxl

python workbook

1
2
3
4
5
6
7
8
9
10
11
# Start a fresh workbook.
workbook = openpyxl.Workbook()

# Rows are written to the workbook's active sheet.
sheet = workbook.active

# One row per record: a type tag, then the author name, the id and the text.
post_row = ['one_post', screen_name, mid, text]
sheet.append(post_row)

# Write the workbook out under the given file name.
workbook.save('file_name.xlsx')

抓取接口

開啟開發人員工具後,進到微博個人主頁後可以在網路的部分(資料類型請選取XHR)抓到getIndex開頭的接口
(小提示:有時候會抓不到但是如果把網頁換成手機版就可以抓到了~ 還有嘗試過後發現getIndex和getInfo好像都是可以的!)

在抓到後可以在標頭內抓到要求URL

取得URL後就可以開始啦!

抓取文章

1
2
3
# Page number to request; it is appended to the listing URL below.
page = 1
comments_url = 'https://m.weibo.cn/api/container/getIndex?jumpfrom=weibocom&type=uid&value=5627362571&containerid=1076035627362571&page=' + str(page)

# Fetch that page and record its posts.
onepage(comments_url)

因為微博一頁只有十則貼文,所以如果想要抓多點,可以改成這樣呦!(把 range 的結束值調大就能抓更多頁,例如 range(1, 6) 會抓第 1 到 5 頁)

1
2
3
4
5
# Everything in the listing URL except the page number.
page_url_prefix = 'https://m.weibo.cn/api/container/getIndex?jumpfrom=weibocom&type=uid&value=5627362571&containerid=1076035627362571&page='

# range(1, 2) yields only page 1; raise the stop value to fetch more pages.
for page in range(1, 2):
    # Wait one second before each request.
    time.sleep(1)
    comments_url = page_url_prefix + str(page)
    onepage(comments_url)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Posts of one listing page.
def onepage(comments_url):
    """Fetch one page of posts and record each post in the worksheet.

    Requests ``comments_url``, reads the ``data -> cards`` list from the JSON
    reply and, for every card carrying an ``mblog`` entry, prints the
    author's screen name, the post id (``mid``) and the post text, then
    appends them to the module-level ``sheet`` as a ``'one_post'`` row.

    Args:
        comments_url: Full URL of the listing endpoint, page number included.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(comments_url, timeout=10)
    comments_data = response.json()

    # A reply without the expected data -> cards structure is reported and
    # skipped instead of raising KeyError/TypeError.
    try:
        cards = comments_data['data']['cards']
    except (KeyError, TypeError):
        cards = None
    if not cards:
        print('no cards returned for', comments_url)
        return

    for card in cards:
        m_blog = card.get('mblog')
        if not m_blog:
            # Cards without a post payload are skipped.
            continue

        text = m_blog.get('text')
        mid = m_blog.get('mid')
        # Guard against a post whose 'user' entry is missing or None.
        user = m_blog.get('user') or {}
        screen_name = user.get('screen_name')

        print([screen_name, mid, text])
        sheet.append(['one_post', screen_name, mid, text])
        print('\n')

在接口預覽的地方(data -> cards -> 每一張 card -> mblog)可以查看完整的 mblog 的所有資料,裏面有文章的詳細資料,可以自行決定個人想要抓取的資料有哪些
這邊選擇抓取用戶名、mid(每一條文章的id)、文章內容~

抓取熱評

點進文章後用關鍵字hotflow過濾後剩下的那些就是熱評啦!
觀察那些接口的URL可以發現連結會是:
https://m.weibo.cn/comments/hotflow?id= + mid + &mid= + mid + &max_id_type=0

而mid就是我們剛剛抓到的資料~(每一則文章的專屬id)
所以我們就可以利用這個特性……

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
def hotcomment(mid):
    """Fetch the hot comments of one post and record them in the worksheet.

    Builds the ``hotflow`` URL from ``mid``, requests it and, for every
    comment in the reply's ``data -> data`` list, prints the commenter's
    screen name, the comment's own ``mid`` and its text, then appends them to
    the module-level ``sheet`` as a ``'hot_comment'`` row.

    Args:
        mid: Id of the post whose hot comments are wanted (str or int).
    """
    # Convert once so an integer id works for both query parameters.
    post_id = str(mid)
    sub_comment_url = ('https://m.weibo.cn/comments/hotflow?id=' + post_id
                       + '&mid=' + post_id + '&max_id_type=0')
    # Without a timeout a stalled connection would block the crawl forever.
    sub_response = requests.get(sub_comment_url, timeout=10)
    sub_data = sub_response.json()

    # A reply without the data -> data list is treated as "no hot comments"
    # instead of raising KeyError/TypeError.
    try:
        data_list = sub_data['data']['data']
    except (KeyError, TypeError):
        data_list = None
    if not data_list:
        return

    for data in data_list:
        text = data.get('text')
        # Separate name so the post id parameter is not overwritten.
        comment_mid = data.get('mid')

        # Resolve the name afresh for every comment so a comment without
        # 'more_info_users' neither fails nor reuses the previous name.
        more_info_users = data.get('more_info_users')
        if more_info_users:
            screen_name = more_info_users[0].get('screen_name')
        else:
            # NOTE(review): assumes comment objects carry a 'user' dict with
            # 'screen_name', as posts do -- TODO confirm against the API reply.
            screen_name = (data.get('user') or {}).get('screen_name')

        print('sub_commit:', [screen_name, comment_mid, text])
        sheet.append(['hot_comment', screen_name, comment_mid, text])
        print('\n')

抓取熱評啦!!

完整程式碼

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import requests 
import pprint
import time
import openpyxl

# Create a new workbook to collect the scraped rows.
workbook = openpyxl.Workbook()

# Use the workbook's active sheet; onepage() and hotcomment() append to it.
sheet = workbook.active

# Posts of one listing page, each followed by its hot comments.
def onepage(comments_url):
    """Fetch one page of posts and record each post with its hot comments.

    Requests ``comments_url``, reads the ``data -> cards`` list from the JSON
    reply and, for every card carrying an ``mblog`` entry, prints the
    author's screen name, the post id (``mid``) and the post text, appends
    them to the module-level ``sheet`` as a ``'one_post'`` row, and then
    calls ``hotcomment(mid)`` for that post.

    Args:
        comments_url: Full URL of the listing endpoint, page number included.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(comments_url, timeout=10)
    comments_data = response.json()

    # A reply without the expected data -> cards structure is reported and
    # skipped instead of raising KeyError/TypeError.
    try:
        cards = comments_data['data']['cards']
    except (KeyError, TypeError):
        cards = None
    if not cards:
        print('no cards returned for', comments_url)
        return

    for card in cards:
        m_blog = card.get('mblog')
        if not m_blog:
            # Cards without a post payload are skipped.
            continue

        text = m_blog.get('text')
        mid = m_blog.get('mid')
        # Guard against a post whose 'user' entry is missing or None.
        user = m_blog.get('user') or {}
        screen_name = user.get('screen_name')

        print([screen_name, mid, text])
        sheet.append(['one_post', screen_name, mid, text])
        print('\n')

        # Record this post's hot comments right after the post itself.
        hotcomment(mid)

def hotcomment(mid):
    """Fetch the hot comments of one post and record them in the worksheet.

    Builds the ``hotflow`` URL from ``mid``, requests it and, for every
    comment in the reply's ``data -> data`` list, prints the commenter's
    screen name, the comment's own ``mid`` and its text, then appends them to
    the module-level ``sheet`` as a ``'hot_comment'`` row.

    Args:
        mid: Id of the post whose hot comments are wanted (str or int).
    """
    # Convert once so an integer id works for both query parameters.
    post_id = str(mid)
    sub_comment_url = ('https://m.weibo.cn/comments/hotflow?id=' + post_id
                       + '&mid=' + post_id + '&max_id_type=0')
    # Without a timeout a stalled connection would block the crawl forever.
    sub_response = requests.get(sub_comment_url, timeout=10)
    sub_data = sub_response.json()

    # A reply without the data -> data list is treated as "no hot comments"
    # instead of raising KeyError/TypeError.
    try:
        data_list = sub_data['data']['data']
    except (KeyError, TypeError):
        data_list = None
    if not data_list:
        return

    for data in data_list:
        text = data.get('text')
        # Separate name so the post id parameter is not overwritten.
        comment_mid = data.get('mid')

        # Resolve the name afresh for every comment so a comment without
        # 'more_info_users' neither fails nor reuses the previous name.
        more_info_users = data.get('more_info_users')
        if more_info_users:
            screen_name = more_info_users[0].get('screen_name')
        else:
            # NOTE(review): assumes comment objects carry a 'user' dict with
            # 'screen_name', as posts do -- TODO confirm against the API reply.
            screen_name = (data.get('user') or {}).get('screen_name')

        print('sub_commit:', [screen_name, comment_mid, text])
        sheet.append(['hot_comment', screen_name, comment_mid, text])
        print('\n')

# Crawl the requested pages, then write everything to the Excel file.
try:
    # range(1, 2) yields only page 1; raise the stop value to fetch more pages.
    for page in range(1, 2):
        # Wait one second before each page request (presumably to throttle
        # the crawl -- adjust as needed).
        time.sleep(1)
        comments_url = 'https://m.weibo.cn/api/container/getIndex?jumpfrom=weibocom&type=uid&value=5627362571&containerid=1076035627362571&page=' + str(page)

        onepage(comments_url)
finally:
    # Save even when a request fails part-way, so the rows gathered so far
    # are not lost.
    workbook.save('nine.xlsx')