import requests
from bs4 import BeautifulSoup  # parse the HTML string into a Python object tree
import pandas as pd

BASE_URL = 'http://www.runoob.com'
url = 'http://www.runoob.com/html/html-tutorial.html'

r = requests.get(url)
# Round-trip through the response's declared encoding to repair mojibake
# (runoob declares an encoding that requests' default decode mangles).
html = r.text.encode(r.encoding).decode()
soup = BeautifulSoup(html, 'lxml')  # load the HTML into a BeautifulSoup object

# Extract the text of every <h2> (sub-heading) on the start page.
l = [x.text for x in soup.findAll('h2')]
# One-column DataFrame; the column is named after the source URL.
df = pd.DataFrame(l, columns=[url])

x = soup.findAll('a')[1]  # inspect the second anchor element
x.has_attr('href')        # check whether it carries an href attribute
x.attrs['href']           # attrs returns a dict; 'href' is the link target

# Filter: keep only anchors that have an href pointing at a relative
# tutorial path ('/html...').
links = [i for i in soup.findAll('a')
         if i.has_attr('href') and i.attrs['href'][0:5] == '/html']

relative_urls = set([i.attrs['href'] for i in links])
absolute_urls = {BASE_URL + i for i in relative_urls}
absolute_urls.discard(url)  # drop the page we already scraped

for i in absolute_urls:
    ri = requests.get(i)
    soupi = BeautifulSoup(ri.text.encode(ri.encoding), 'lxml')
    li = [x.text for x in soupi.findAll('h2')]
    # BUG FIX: the original passed `l` (the start page's headings) here,
    # so `li` was never used and every column repeated the same data.
    dfi = pd.DataFrame(li, columns=[i])
    df = df.join(dfi, how='outer')  # outer join: keep rows from both sides

df