『楼 主』:
[分享][原创]多线程网页下载脚本
欢迎试用,本脚本程序采用灵活的函数方式对下载方案进行控制,所以使用本脚本需要一定的脚本知识.
请将以下内容存为 "多线程网页下载脚本.HTA"
<!--
版本 1.1
作者 sonic_andy
-->
<HTA:APPLICATION
APPLICATIONNAME="AUTODOWNLOAD"
BORDER="THICK"
CAPTION="YES"
CONTEXTMENU="YES"
NAVIGABLE="NO"
SCROLL="NO"
SELECTION="NO"
SINGLEINSTANCE="YES"
SYSMENU="YES"
SHOWINTASKBAR="YES"
VERSION="1.0"
WINDOWSTATE="NORMAL"
>
<head>
<meta http-equiv=Content-Type content="text/html;charset=gb2312">
<title>多线程网页下载脚本<sonic_andy></title>
<script language="vbscript">
Option Explicit ' 严格语法
Const INITIALIZED = "初始状态"
Const RUNNING = "执行中"
Const PAUSED = "暂停状态"
Const EVENT_INIT = "初始化事件"
Const EVENT_START = "开始事件"
Const EVENT_PAUSE = "暂停事件"
Const EVENT_STOP = "停止事件"
Dim fcount ' 已保存文件计数
Dim depth ' 当前深度
Dim id ' 本程序中网页元素集合
Dim taskQueue ' 任务队列
Dim oIntervalRun' 执行函数终止句柄
Dim cTaskDone ' 任务是否完成
Dim cie ' InternetExplorer集合对象
Dim stream ' Ado的stream对象,用来保存文件(转换内码为Unicode)
Dim depthCount ' 当前深度的剩余网址个数
Dim regex ' 正则表达式对象
Dim status ' 当前状态
Dim urlDone ' 已经下载的url
Dim cTaskNum ' 任务编号
Dim tcount ' 任务计数
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
' 初始化/重置 函数
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
' 初始化
Sub initialize()
Set id = document.body.all
Set stream = CreateObject("adodb.stream")
Set cie = CreateObject("scripting.dictionary")
Set cTaskDone = CreateObject("scripting.dictionary")
Set cTaskNum = CreateObject("scripting.dictionary")
Set regex = new RegExp
regex.global = True
regex.ignorecase = True
id.status.contentEditable = False
Call reset()
Call resize()
End Sub
' 重置变量
Sub reset()
fcount = 0
tcount = 0
depth = 0
depthCount = 0
taskQueue = ""
urlDone = ""
While cie.count > 0
' 如果ie窗口被手动关闭,这里可能会抛出异常
On Error Resume Next
Call cie.items()(0).quit()
On Error GoTo 0
Call cie.remove(cie.keys()(0))
Wend
Call cTaskDone.removeAll()
Call cTaskNum.removeAll()
Call processEvent(EVENT_INIT)
End Sub
Sub resize()
Dim cwidth,bwidth
cwidth = Document.body.clientwidth - 10
bwidth = 100
id.code.style.height = Document.body.clientHeight - 85
id.code.style.width = cwidth
id.status.style.width = cwidth
id.start.style.width = bwidth
id.pause.style.width = bwidth
id.stop.style.width = bwidth
id.example.style.width = bwidth
End Sub
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
' 功能函数
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
Function isAllTaskDone()
Dim i
For i=0 To cTaskDone.count-1
If Not cTaskDone(i) Then
isAllTaskDone = False
Exit Function
End If
Next
isAllTaskDone = True
End Function
Function taskCount()
If taskQueue = "" Then
taskCount = 0
Else
taskCount = UBound(Split(taskQueue,vbcrlf)) + 1
End If
End Function
' 更新状态
Sub updateStatus()
id.status.value = "当前状态:" & status & " 深度:" & depth & " 已完成:" & tcount & " 已保存:" & fcount & _
" 队列长度:" & taskCount() & " 本级个数:" & depthCount & " 线程数:" & cie.count
End Sub
Sub quit()
Call reset()
End Sub
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
' 状态函数
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
' <状态说明>
' 状态 描述 [开始] [暂停] [停止]
' (1) INITIALIZED *
' (2) RUNNING * *
' (3) PAUSED * *
' FINISHED = INITIALIZED
' 更新按钮
Sub updateButton()
with id
Select Case status
Case INITIALIZED
.start.disabled = False
.pause.disabled = True
.stop.disabled = True
Case RUNNING:
.start.disabled = True
.pause.disabled = False
.stop.disabled = False
Case PAUSED:
.start.disabled = False
.pause.disabled = True
.stop.disabled = False
End Select
End with
End Sub
' <状态转换>
' (1)-[开始]->(2)
' (2)-[暂停]->(3)
' (2)-[停止]->(1)
' (3)-[开始]->(2)
' (3)-[停止]->(1)
Sub processEvent(e)
If e = EVENT_INIT Then
status = INITIALIZED
ElseIf status=INITIALIZED And e=EVENT_START Then
status = RUNNING
ElseIf status=INITIALIZED And e=EVENT_PAUSE Then
status = INITIALIZED
ElseIf status=RUNNING And e=EVENT_PAUSE Then
status = PAUSED
ElseIf status=RUNNING And e=EVENT_STOP Then
status = INITIALIZED
ElseIf status=PAUSED And e=EVENT_START Then
status = RUNNING
ElseIf status=PAUSED And e=EVENT_STOP Then
status = INITIALIZED
Else
Call MsgBox("状态:" & status & "意外事件:" & e)
End If
Call updateButton()
Call updateStatus()
End Sub
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
' 事件处理函数
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
' [暂停]
Sub doPause()
Call processEvent(EVENT_PAUSE)
Call clearInterval(oIntervalRun)
End Sub
' [停止]
Sub doStop()
Call processEvent(EVENT_STOP)
Call clearInterval(oIntervalRun)
Call reset()
End Sub
' [开始]
Sub doStart()
execute id.code.value
If status = INITIALIZED Then enqueue(getStartUrls()) ' !!
Call processEvent(EVENT_START)
oIntervalRun = setInterval(getref("Run"),1000,"vbscript")
End Sub
' [例子]
Sub showExample()
id.code.value = id.examplecode.value
End Sub
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
' 任务
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
' 执行任务,该函数不停被调用一直到浏览器完成了下载工作
Sub Run()
execute id.code.value
While canCreateIE(cie.count)
Dim oIE
Set oIE = CreateObject("internetexplorer.application")
Call setIE(oIE)
Call cie.Add(cie.count,oIE)
Call cTaskDone.Add(cTaskDone.count,True)
Call cTaskNum.Add(cTaskNum.count,0)
Wend
Dim i
For i=0 To cTaskDone.count-1
If cTaskDone(i) Then
Dim url
url = dequeue()
If url = "" Then
If isAllTaskDone() Then
Call processEvent(EVENT_STOP)
Call clearInterval(oIntervalRun)
Call reset()
Call MsgBox("全部完成!")
End If
Else
On Error Resume Next
Call cie(i).navigate(url)
On Error Goto 0
cTaskDone(i) = False
cTaskNum(i) = tcount
Call Analyze(i)
End If
Else
Call Analyze(i)
End If
Next
End Sub
Sub task
End Sub
' 保存/分析网页
Sub Analyze(index)
Dim ieStat
On Error Resume Next
ieStat = cie(index).readystate
If Err.number<>0 Then Exit Sub
On Error Goto 0
If ieStat >= 3 Then
execute id.code.value
Dim doc
On Error Resume Next
Set doc = cie(index).Document
If Err.number<>0 Then Exit Sub
On Error goto 0
' 保存结果
If canSave(doc,cTaskNum(index),fcount,depth,regex) Then ' !!
Dim filename
filename = getFileName(doc,cTaskNum(index),fcount,depth,regex) ' !!
stream.type = 2
stream.mode = 3
Call stream.open()
Call stream.writetext(getContent(doc,cTaskNum(index),fcount,depth,regex)) ' !!
Call stream.savetofile(filename,2)
Call stream.close()
fcount = fcount + 1
End If
' 分析网址
If canAnalyze(doc,cTaskNum(index),fcount,depth,regex) Then ' !!
enqueue(pushUrls(doc,cTaskNum(index),fcount,depth,regex)) ' !!
End If
cTaskDone(index) = True
End If
End Sub
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
' 任务管理
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
' 状态 taskQueue depth depthCount 描述
' (0) "" 0 0 开始
' (1) "www.baidu.com" 1 1 加入了1级地址(1个)
' (2) "" 1 0 退出了1级地址(1个)
' (3) "www.sohu.com 2 2 加入了2级地址(2个)
' [url]www.sina.com[/url]"
' (4) "www.sina.com" 2 1 退出了2级地址(1个)
' (5) "www.sina.com
' [url]www.wcg.com[/url]" 2 0 加入了3级地址(1个)
' (6) "www.wcg.com" 2 1 退出了2级地址(1个)
' (7) "" 3 0 退出了3级地址(1个)
' 从上表可以看出:
' 1 退出的时候需要减depthCount
' 2 加入的时候如果depthCount=0,重新计算depthCount并且depth增1
' 更新深度
Sub updateDepth()
If depthCount = 0 Then
depth = depth + 1
depthCount = taskCount()
End If
End Sub
' 入队列
Sub enqueue(urls)
Dim arr,i
arr = Split(urls,vbcrlf)
For i=0 To UBound(arr)
If InStr(urlDone,arr(i)) = 0 And arr(i)<>"" Then
taskQueue = taskQueue & arr(i) & vbcrlf
urlDone = urlDone & arr(i) & vbcrlf
End If
Next
Call updateDepth()
End Sub
' 出队列
Function dequeue()
If depthCount > 0 Then
depthCount = depthCount - 1
End If
If taskQueue<>"" Then
Dim pos
pos = InStr(taskQueue,vbcrlf)
dequeue = Mid(taskQueue,1,pos-1)
taskQueue = Mid(taskQueue,pos+2)
tcount = tcount + 1
Else
dequeue = taskQueue
taskQueue = ""
End If
Call updateStatus()
End Function
</script>
</head>
<body scroll="no" language="vbscript" onload="call initialize()" onunload="call quit()" onresize="call resize()">
欢迎使用多线程网页下载脚本!请完成下列函数:<BR>
<TEXTAREA NAME="code">
' 得到以vbcrlf为分隔符的url作为开始分析/下载的地址
' 返回值类型:String
function getStartUrls()
end function
' depth startUrls()中的地址为1,下一级为2,以此类推
' 本程序使用广度优先遍历,会将新分析出的网页放到任务队列末尾
' 此函数确定是否分析当前页面
' 返回值类型:Boolean
function canAnalyze(doc,tnum,fcount,depth,regex)
end function
' 此函数将以vbcrlf为分隔符的url放入下载列表中
' 返回值类型:String
function pushUrls(doc,tnum,fcount,depth,regex)
end function
' 此函数确定是否保存此页面
' 返回值类型:Boolean
function canSave(doc,tnum,fcount,depth,regex)
end function
' 此函数返回需要保存的文件内容
' 返回值类型:String
function getContent(doc,tnum,fcount,depth,regex)
end function
' 此函数返回文件名
' 返回值类型:String
function getFileName(doc,tnum,fcount,depth,regex)
end function
' 是否继续增加ie
function canCreateIE(count)
end function
' 初始化ie属性
sub setIE(oIE)
end sub
</TEXTAREA>
<input type="text" id="status"><br>
<center>
<input id="start" type="button" onclick="vbscript: call doStart()" value="开始">
<input id="pause" type="button" onclick="vbscript: call doPause()" value="暂停">
<input id="stop" type="button" onclick="vbscript: call doStop()" value="停止">
<input id="example" type="button" onclick="vbscript: call showExample()" value="示例">
</center>
<TEXTAREA NAME="examplecode" style="visibility:hidden">
' 得到以vbcrlf为分隔符的url作为开始分析/下载的地址
' 返回值类型:String
function getStartUrls()
getStartUrls = "http://www.woyouxian.com/b06/b060401/of_human_bondage_cnindex.html"
end function
' depth startUrls()中的地址为1,下一级为2,以此类推
' 本程序使用广度优先遍历,会将新分析出的网页放到任务队列末尾
' 此函数确定是否分析当前页面
' 返回值类型:Boolean
function canAnalyze(doc,tnum,fcount,depth,regex)
if depth = 1 then
cananalyze = true
else
cananalyze = false
end if
end function
' 此函数将以vbcrlf为分隔符的url放入下载列表中
' 返回值类型:String
function pushUrls(doc,tnum,fcount,depth,regex)
regex.pattern = "/b06/b060401/of_human_bondage_[^.]+\.html"
set matches = regex.execute(doc.body.innerhtml)
for each match in matches
pushUrls = pushUrls & "http://www.woyouxian.com" & match & vbcrlf
next
end function
' 此函数确定是否保存此页面
' 返回值类型:Boolean
function canSave(doc,tnum,fcount,depth,regex)
cansave = true
end function
' 此函数返回需要保存的文件内容
' 返回值类型:String
function getContent(doc,tnum,fcount,depth,regex)
getcontent = doc.body.innertext
end function
' 此函数返回文件名
' 返回值类型:String
function getFileName(doc,tnum,fcount,depth,regex)
getfileName = tnum & ".txt"
end function
' 是否继续增加ie
function canCreateIE(count)
if count < 5 then
canCreateIE = true
else
canCreateIE = false
end if
end function
' 初始化ie属性
sub setIE(oIE)
oIE.visible = true
oIE.silent = true
end sub
</TEXTAREA>
</body>
</hta:application> 附开发文档: DHTML手册 Web开发手册(ie对象) VB语言手册 ADO手册(Stream对象)
[ Last edited by sonicandy on 2007-10-1 at 11:20 AM ]
|