# Crawl SETN's article list: take the first three links on each of the first three
# pages, fetch every post, and collect the results into a pandas DataFrame.
import requests
from bs4 import BeautifulSoup as bs
import time
import arrow
import pandas
request_headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36',
}
# Step 1: gather post URLs from the listing pages (3 links per page, pages 1-3).
post_url_list = []
for n in range(1, 4):
    print('[Page {}]'.format(n))
    url = 'http://www.setn.com/ViewAll.aspx?p={}'.format(n)
    r = requests.get(url, headers=request_headers)
    time.sleep(2)  # pause between requests to avoid hammering the server
    soup = bs(r.text, 'lxml')
    for t in soup.select('.box li')[:3]:
        a_tag = t.select('a')[0]
        url_tail = a_tag['href']  # relative link taken from the listing page
        post_url = 'http://www.setn.com{}'.format(url_tail)
        post_url_list.append(post_url)
# Step 2: fetch each post and extract its metadata and body text.
post_list = []
for p_url in post_url_list:
    print(p_url)
    r = requests.get(p_url, headers=request_headers)
    time.sleep(2)
    soup = bs(r.text, 'lxml')
    post_time_str = soup.select('meta[name="pubdate"]')[0]['content']
    post_time = arrow.get(post_time_str).replace(tzinfo='local').datetime
    dic = {
        'url': p_url,
        'category': soup.select('#toptitle a')[1].text,  # second breadcrumb link
        'title': soup.select('div.title > h1')[0].text,
        'post_time': post_time,
        'content': soup.select('#Content1')[0].text.strip(),
    }
    post_list.append(dic)
df = pandas.DataFrame(post_list)
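# Optional sketch (not part of the original script): persist the scraped posts to
# CSV with pandas. The filename 'setn_posts.csv' is an assumed placeholder, and
# 'utf-8-sig' is chosen so the Chinese text opens cleanly in Excel.
df.to_csv('setn_posts.csv', index=False, encoding='utf-8-sig')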
df