''
'
#此处代码为普通爬虫
import urllib.request
import urllib.error
import re
# Build an opener that sends a desktop-browser User-Agent header and install
# it as the process-wide default used by urllib.request.urlopen().
opener = urllib.request.build_opener()
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36")
opener.addheaders = [headers]  # addheaders is a list of (name, value) pairs
urllib.request.install_opener(opener)
# Single-threaded crawl: fetch listing page 1 only (range(1, 2)) and print
# every joke the pattern finds on it.
#
# The pattern is loop-invariant, so it is compiled once up front. re.S lets
# ".*?" run across line breaks inside the <p class="content"> element.
pattern = '<p class="content">.*?<span>(.*?)</span>(.*?)</p>'
joke_regex = re.compile(pattern, re.S)
for i in range(1, 2):
    url = "https://www.qiushibaike.com/8hr/page/" + str(i) + "/"
    # Use the response as a context manager so the connection is released as
    # soon as the body has been read, even if read()/decode() raises.
    with urllib.request.urlopen(url) as response:
        # "ignore" drops bytes that are not valid UTF-8 instead of raising.
        pagedata = response.read().decode("utf-8", "ignore")
    # Each match is a 2-tuple: (text inside <span>, text between </span> and </p>).
    datalist = joke_regex.findall(pagedata)
    for j, joke in enumerate(datalist):
        print("第" + str(i) + "页第" + str(j) + "个段子内容是:")
        print(joke)
''
'
''
'
#此处为多线程介绍代码
import threading #导入多线程包
class A(threading.Thread):
    """Demo thread that prints a fixed message eleven times."""

    def __init__(self):
        # The constructor must use the dunder name: a method called plain
        # "init" is never invoked when the object is created.
        super().__init__()

    def run(self):
        """Thread body; start() arranges for this to run in a new thread."""
        for _ in range(11):
            print("我是线程A")
class B(threading.Thread):
    """Second demo thread; prints its own fixed message eleven times."""

    def __init__(self):
        # The constructor must use the dunder name: a method called plain
        # "init" is never invoked when the object is created.
        super().__init__()

    def run(self):
        """Thread body; start() arranges for this to run in a new thread."""
        for _ in range(11):
            print("我是线程B")
# Create and start the demo threads one after the other: A is already
# running before B is instantiated.
started = []
for thread_cls in (A, B):
    worker = thread_cls()  # instantiate the thread object
    worker.start()         # run() now executes in its own thread
    started.append(worker)
t1, t2 = started
''
'
#此处为修改后的多线程爬虫
#使用多线程进行奇偶页的爬取
import urllib.request
import urllib.error
import re
import threading
# Every request made through urllib.request.urlopen() goes out with this
# desktop-browser User-Agent, via an opener installed as the global default.
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]  # addheaders is a list of (name, value) pairs
urllib.request.install_opener(opener)
class one(threading.Thread):
    """Crawler thread for the odd-numbered listing pages (1, 3, ..., 11).

    For each page it downloads the HTML, extracts every joke with
    ``JOKE_PATTERN`` and prints a header line followed by the match.
    """

    # Listing pages this thread is responsible for.
    PAGES = range(1, 12, 2)
    # Compiled once for the class instead of once per page. re.S lets ".*?"
    # run across line breaks inside the <p class="content"> element.
    JOKE_PATTERN = re.compile('<p class="content">.*?<span>(.*?)</span>(.*?)</p>', re.S)

    def __init__(self):
        # The constructor must use the dunder name: a method called plain
        # "init" is never invoked when the object is created.
        super().__init__()

    def run(self):
        """Fetch each page in ``PAGES`` and print the jokes found on it."""
        for i in self.PAGES:
            url = "https://www.qiushibaike.com/8hr/page/" + str(i) + "/"
            # Use the response as a context manager so the connection is
            # released as soon as the body has been read.
            with urllib.request.urlopen(url) as response:
                # "ignore" drops bytes that are not valid UTF-8.
                pagedata = response.read().decode("utf-8", "ignore")
            # Each match is a 2-tuple: (text inside <span>, text between
            # </span> and </p>).
            for j, joke in enumerate(self.JOKE_PATTERN.findall(pagedata)):
                print("第" + str(i) + "页第" + str(j) + "段子内容为:")
                print(joke)
class two(threading.Thread):
    """Crawler thread for the even-numbered listing pages (2, 4, ..., 10).

    For each page it downloads the HTML, extracts every joke with
    ``JOKE_PATTERN`` and prints a header line followed by the match.
    """

    # Listing pages this thread is responsible for.
    PAGES = range(2, 12, 2)
    # Compiled once for the class instead of once per page. re.S lets ".*?"
    # run across line breaks inside the <p class="content"> element.
    JOKE_PATTERN = re.compile('<p class="content">.*?<span>(.*?)</span>(.*?)</p>', re.S)

    def __init__(self):
        # The constructor must use the dunder name: a method called plain
        # "init" is never invoked when the object is created.
        super().__init__()

    def run(self):
        """Fetch each page in ``PAGES`` and print the jokes found on it."""
        for i in self.PAGES:
            url = "https://www.qiushibaike.com/8hr/page/" + str(i) + "/"
            # Use the response as a context manager so the connection is
            # released as soon as the body has been read.
            with urllib.request.urlopen(url) as response:
                # "ignore" drops bytes that are not valid UTF-8.
                pagedata = response.read().decode("utf-8", "ignore")
            # Each match is a 2-tuple: (text inside <span>, text between
            # </span> and </p>).
            for j, joke in enumerate(self.JOKE_PATTERN.findall(pagedata)):
                print("第" + str(i) + "页第" + str(j) + "段子内容为:")
                print(joke)
# Create both crawler threads first, then start them, so the odd and even
# pages are fetched by two threads running side by side.
t1, t2 = one(), two()
for crawler in (t1, t2):
    crawler.start()