F# -- Grabbing a Web Page

宗政文彬
2023-12-01
open System
open System.Diagnostics
open System.Net
open System.Xml
open System.IO
//open HtmlAgilityPack   
/// Downloads the resource at `newUrl` and saves it as GB2312-encoded text
/// in a file under D:\ whose name is derived from the URL.
///
/// The download runs inside an async workflow, but the function blocks
/// (via Async.RunSynchronously) until the file is completely written.
///
/// Returns the full path of the file that was written.
/// Exceptions raised by the web request or by file I/O propagate to the caller.
let asyncGrapUrl(newUrl : string) =
    async{
        // Flatten the URL into a single file name. '.', '/' and ':' map to '0'
        // as before; any other character the file system rejects (e.g. '?',
        // '*', '|' in query strings) is mapped to '0' as well so that the
        // FileStream constructor does not throw on such URLs.
        let invalidChars = Path.GetInvalidFileNameChars()
        let safeName =
            newUrl.Replace('.','0').Replace('/','0').Replace(':','0')
            |> String.map (fun c -> if Array.exists ((=) c) invalidChars then '0' else c)
        let fileName = @"D:\" + safeName + ".xml"

        let httpRequest = HttpWebRequest.Create(newUrl) :?> HttpWebRequest

        // `use` bindings release every resource in reverse order, even when
        // reading or writing fails part-way through.
        use! httpRespon = Async.AwaitTask(httpRequest.GetResponseAsync())
        use responStream = httpRespon.GetResponseStream()
        use strd = new StreamReader(responStream)

        // FileMode.Create truncates an existing file; OpenOrCreate would leave
        // stale bytes from a previous, longer download at the end of the file.
        use fileStream = new System.IO.FileStream(fileName,FileMode.Create,FileAccess.Write)
        use streamWr = new StreamWriter(fileStream,Text.Encoding.GetEncoding("GB2312"))

        // Copy line by line; disposing the writer flushes the buffered output.
        while not strd.EndOfStream do
            streamWr.WriteLine(strd.ReadLine())

        return fileName
    } |> Async.RunSynchronously
     
/// Interactive entry point.
///
/// Downloads the BBC science/technology RSS feed with `asyncGrapUrl`, prints
/// the numbered item titles, reads an item number from the console, opens
/// that item's link with the shell's default handler and saves the linked
/// page to disk with `asyncGrapUrl`.
///
/// Invalid input (blank, non-numeric, out of range) prints a message and
/// returns instead of throwing.
let main() =
    let url = @"http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/sci/tech/rss.xml"
    let filename = asyncGrapUrl(url)

    // Parse the saved feed. The file is read with the same encoding that
    // asyncGrapUrl used to write it; both handles are closed as soon as the
    // document has been loaded.
    let loadXml (path : string) =
        use fileInfo = new System.IO.FileStream(path,FileMode.Open,FileAccess.Read)
        use fileStr = new StreamReader(fileInfo,Text.Encoding.GetEncoding("GB2312"))
        let doc = new XmlDocument()
        doc.Load(fileStr)
        doc
    let xml = loadXml filename

    let nodes = xml.SelectNodes("/rss/channel/item/title")

    for i in 0..(nodes.Count - 1) do
        printfn "%d : %s" (i + 1) nodes.[i].InnerText

    // TryParse also copes with a null line (end of input) without throwing.
    match Int32.TryParse(Console.ReadLine()) with
    | true, item when item >= 1 && item <= nodes.Count ->
        // XPath positions are 1-based, matching the numbers printed above.
        let xpath = sprintf "/rss/channel/item[%i]/link" item
        match xml.SelectSingleNode(xpath) with
        | null -> printfn "Item %d has no link." item
        | node ->
            let newUrl = node.InnerText
            let proStart = new ProcessStartInfo(UseShellExecute=true,FileName=newUrl)
            use proc = new Process()
            proc.StartInfo <- proStart
            proc.Start() |> ignore
            asyncGrapUrl(newUrl) |> ignore
    | _ ->
        printfn "Please enter a number between 1 and %d." nodes.Count

// Script entry point: run the interactive feed reader defined above.
main()

目前还不完善:在英文系统下中文会出现乱码;另外尚未实现正文抓取,只能获取页面的全部源代码。

转载于:https://www.cnblogs.com/FsharpZack/archive/2013/01/04/2844647.html

 类似资料: