# -*- coding: utf-8 -*-
# Download a serialized novel chapter by chapter (Python 2).
import re
import urllib

url = raw_input('input url:')
# Everything before "index.html" is the base URL of the chapter pages.
base = url[:url.find('index.html')]
content = urllib.urlopen(url).read()

# Extract the chapter links (e.g. "123.html") from the index page.
link_re = re.compile(r'(?<=<a href=")\d+\.html(?=")')
links = link_re.findall(content)

# Extract the novel's title.
title_re = re.compile(r'(?<=<div id="title">).+(?=</)')
novel_title = title_re.findall(content)[0]

# Body text sits between the content div's opening markup and a closing </div>.
body_re = re.compile(r'(?<=<div id="content"><div id="adright"></div>)[\s\S]*(?=</div>)')

# Sort the chapters numerically; a plain string sort would put
# "10.html" before "2.html".
links.sort(key=lambda name: int(name.split('.')[0]))

# Build the full URL of each chapter page.
chapter_urls = [base + name for name in links]

# Fetch each chapter and append it to a single text file.
fb = open('d:\\' + novel_title + '.txt', 'w')
for chapter_url in chapter_urls:
    page = urllib.urlopen(chapter_url).read()
    # Chapter title.
    chapter_title = title_re.findall(page)[0]
    # Chapter body.
    body = body_re.findall(page)[0]
    # Turn <br /> tags into newlines and strip padding spaces.
    text = body.replace('<br />', '\n').replace(' ', '')
    print('Now downloading: ' + chapter_title)
    fb.write('\n\n\n' + chapter_title + '\n\n' + text)
fb.close()
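
A quick illustration of two extraction details above, run on a made-up scrap of index HTML (the sample string is invented purely for demonstration): the lookbehind/lookahead pair pulls out just the "N.html" filenames, and those filenames must then be sorted numerically, because a plain string sort puts "10.html" before "2.html".

# Demo: lookaround extraction plus numeric chapter ordering.
import re

sample = '<a href="10.html"><a href="2.html"><a href="1.html">'
links = re.findall(r'(?<=<a href=")\d+\.html(?=")', sample)
print(links)          # ['10.html', '2.html', '1.html']
print(sorted(links))  # ['1.html', '10.html', '2.html'] -- string order is wrong
print(sorted(links, key=lambda n: int(n.split('.')[0])))
                      # ['1.html', '2.html', '10.html'] -- reading order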
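Note that raw_input and urllib.urlopen exist only in Python 2. For reference, here is a minimal Python 3 sketch of the same flow: the div markup and the regexes are taken from the script above, while the 'gbk' page encoding and the output location are assumptions to adjust for the actual target site.

# -*- coding: utf-8 -*-
# Minimal Python 3 sketch of the same downloader. The title/content
# div markup is taken from the script above; the 'gbk' page encoding
# and the output filename are assumptions -- adjust them to whatever
# the target site actually uses.
import re
from urllib.request import urlopen

TITLE_RE = re.compile(r'(?<=<div id="title">).+(?=</)')
BODY_RE = re.compile(
    r'(?<=<div id="content"><div id="adright"></div>)[\s\S]*(?=</div>)')

url = input('input url:')
base = url[:url.find('index.html')]
index = urlopen(url).read().decode('gbk', errors='ignore')

links = sorted(re.findall(r'(?<=<a href=")\d+\.html(?=")', index),
               key=lambda name: int(name.split('.')[0]))
novel_title = TITLE_RE.findall(index)[0]

with open(novel_title + '.txt', 'w', encoding='utf-8') as out:
    for name in links:
        page = urlopen(base + name).read().decode('gbk', errors='ignore')
        chapter_title = TITLE_RE.findall(page)[0]
        body = BODY_RE.findall(page)[0]
        text = body.replace('<br />', '\n').replace(' ', '')
        print('Now downloading: ' + chapter_title)
        out.write('\n\n\n' + chapter_title + '\n\n' + text)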