Python 爬虫第三弹：自动下载小说并储存为 txt

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# coding:utf-8
__author__ = 'yangpeiwen'

from urllib import *
import io
import os
import socket

from bs4 import BeautifulSoup

# Crawl a novel chapter by chapter: start at `url`, save each chapter's text
# to <path>/xs.txt, then follow the page's "next chapter" link until there is
# none left or the site keeps failing.
url = "http://wap.jdxs.net/index.php/book/chapter/bid=100722/cid=16819732/"
path = "xs"
# Stop once more than this many fetches in a row have failed.
max_failures = 10

socket.setdefaulttimeout(3)  # 3-second timeout

# Create the output folder if it does not exist yet.
if not os.path.exists(path):
    os.makedirs(path)

# The context manager closes the file even when a page fails to parse.
# io.open accepts unicode text and encodes it as UTF-8 on write.
with io.open(path + "/xs.txt", "w", encoding="utf-8") as f:
    # Counts consecutive failed fetches. It must live outside the loop:
    # resetting it on every pass would make the give-up check unreachable.
    ci = 0
    while True:
        # Only the download is retried; parsing and disk errors are not
        # transient, so they are left to propagate.
        try:
            html = urlopen(url).read()
        except IOError:
            ci += 1
            if ci > max_failures:
                break
            continue  # retry the same URL
        ci = 0

        # NOTE(review): no parser is named, so bs4 picks whichever one is
        # installed -- consider pinning one for reproducible results.
        soup = BeautifulSoup(html)

        chapter = soup.find(attrs={'class': 'chapter'})
        if chapter is None:
            # Page has no chapter body; nothing to save or follow.
            break
        text = chapter.text
        print(text)
        f.write(text)

        # Continue only when the "next" button really is labelled
        # "next chapter" and carries a link; anything else ends the crawl.
        next_link = soup.find(id='btnNext')
        if next_link is None or not next_link.text.startswith(u'下一章'):
            break
        href = next_link.get('href')
        if not href:
            break
        url = "http://wap.jdxs.net/" + href
        print(url)