Thread: Rss2Book
View Single Post
Old 01-17-2007, 02:19 AM   #85
fritz_the_blank
Member
fritz_the_blank began at the beginning.
 
Posts: 20
Karma: 10
Join Date: Jan 2007
Device: Sony PRS-500
@GeekRaver--

Thank you for replying. As for the first issue, try:

http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml
Thanks again,


PS--I have written some code that scrapes all of the .xml files from a given page. Here is the code in case anyone should find it helpful:

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<html>
<head>
<title>GetFeeds</title>

</head>
<%
' Downloads strURL with a synchronous HTTP GET and returns the response body
' as text.
'   strURL - address of the page to fetch
' Returns the response text, or the literal string "Error" when creating the
' HTTP object, sending the request, or reading the response raises an error.
Function GetHTML(strURL)
    Dim objXMLHTTP, strReturn

    ' Without error trapping a failed request aborts the whole page, so the
    ' Err test below would never be reached. The setting is local to this
    ' procedure and ends when the function returns.
    On Error Resume Next

    Set objXMLHTTP = Server.CreateObject("MSXML2.ServerXMLHTTP")
    objXMLHTTP.Open "GET", strURL, False   ' False = wait for the response
    objXMLHTTP.Send

    If Err.Number = 0 Then
        strReturn = objXMLHTTP.responseText
    End If

    ' Checked after the read as well, so a failure in responseText is reported too.
    If Err.Number <> 0 Then
        strReturn = "Error"
        Err.Clear
    End If

    Set objXMLHTTP = Nothing
    GetHTML = strReturn
End Function

' Turns the text of a matched HREF attribute into a bare absolute URL.
'   strURLText - attribute text as matched in the page, for example
'                HREF="feeds/a.xml" class="rss">  (an already bare URL is
'                accepted as well)
'   strURL     - base that is prepended, joined with "/", when the link does
'                not start with http:// or https://
' Returns the link with quotes, spaces and ">" removed and with doubled
' slashes after the scheme collapsed to single ones.
' NOTE(review): the base is joined by plain concatenation, so a base that
' names a file (".../index.html") yields ".../index.html/<link>" for relative
' links - confirm whether callers need real relative-URL resolution.
Function CleanURL(strURLText, strURL)
    Dim objAttr, colAttr, strLink

    ' Pull only the attribute value out of the matched text. Removing the
    ' words "href" and "=" from the whole string would also damage URLs that
    ' contain them and would glue any following attributes onto the link.
    Set objAttr = New RegExp
    objAttr.IgnoreCase = True
    objAttr.Global = False
    objAttr.Pattern = "^[^a-z0-9]*href\s*=\s*[""']?([^'"" >]+)"
    Set colAttr = objAttr.Execute(strURLText)
    If colAttr.Count > 0 Then
        strLink = colAttr(0).SubMatches(0)
    Else
        ' No leading HREF= found: treat the input as the URL itself.
        strLink = strURLText
    End If
    Set colAttr = Nothing
    Set objAttr = Nothing

    ' Drop leftover delimiters; both quote styles are valid around an HREF value.
    strLink = Replace(strLink, " ", "")
    strLink = Replace(strLink, """", "")
    strLink = Replace(strLink, "'", "")
    strLink = Replace(strLink, ">", "")

    ' Links without an http/https scheme get the base prepended.
    If LCase(Left(strLink, 7)) <> "http://" And LCase(Left(strLink, 8)) <> "https://" Then
        strLink = strURL & "/" & strLink
    End If

    ' Collapse "//" after the first 8 characters, which hold the scheme
    ' separator that must keep its double slash. Short strings are left
    ' alone instead of raising on a negative length.
    If Len(strLink) > 8 Then
        strLink = Left(strLink, 8) & Replace(Mid(strLink, 9), "//", "/")
    End If

    CleanURL = strLink
End Function

' Scans strPageToParse for HREF attributes and writes every link whose matched
' text contains ".xml" to the response, one per line (followed by "<br />").
'   strPageToParse - HTML text to search
' Reads the page-level variable strURL and passes it to CleanURL as the base
' for links that are not absolute. Stops after 1000 links have been written.
Sub findLinks(strPageToParse)
    Dim objRegExp, colMatches, itmMatch, intCounter

    Set objRegExp = New RegExp
    objRegExp.IgnoreCase = True
    objRegExp.Global = True
    ' NOTE(review): the leading "]*?" looks like the remains of an anchor-tag
    ' prefix that was lost when the code was posted - confirm the intended
    ' pattern. As written it matches any HREF=... up to the next ">".
    objRegExp.Pattern = "]*?HREF\s*=\s*[""']?([^'"" >]+?)[ '""]?[^>]*?>"
    Set colMatches = objRegExp.Execute(strPageToParse)

    intCounter = 0
    For Each itmMatch In colMatches
        If InStr(1, itmMatch.Value, ".xml", 1) > 1 Then
            ' The whole match is handed to CleanURL: the capture group is
            ' lazy, so it cannot be relied on to hold the complete URL.
            ' The link comes from a remote page, so it is HTML-encoded
            ' before being written into this page's markup.
            Response.Write Server.HTMLEncode(CleanURL(itmMatch.Value, strURL)) & "<br />"
            intCounter = intCounter + 1
            If intCounter > 999 Then
                Exit For
            End If
        End If
    Next

    Set colMatches = Nothing
    Set objRegExp = Nothing
End Sub

' Page entry point: fetch the feed index page and list the .xml links it contains.
' strURL is kept as a page-level variable because findLinks reads it as the
' base URL for links that are not absolute.
strURL = "http://www.nytimes.com/services/xml/rss/index.html"

' Download the page text, then hand it to the link scanner.
strPageToParse = GetHTML(strURL)
findLinks strPageToParse
%>
<body>

</body>
</html>

FtB

Last edited by fritz_the_blank; 01-17-2007 at 02:22 AM.
fritz_the_blank is offline   Reply With Quote