webcrawling - Developer IT

Asynchronous Webcrawling F#, something wrong ?

- by jlezard

Not quite sure if it is ok to do this but, my question is: Is there something wrong with my code ? It doesn't go as fast as I would like, and since I am using lots of async workflows maybe I am doing something wrong. The goal here is to build something that can crawl 20 000 pages in less than an hour. open System open System.Text open System.Net open System.IO open System.Text.RegularExpressions open System.Collections.Generic open System.ComponentModel open Microsoft.FSharp open System.Threading //This is the Parallel.Fs file type ComparableUri ( uri: string ) = inherit System.Uri( uri ) let elts (uri:System.Uri) = uri.Scheme, uri.Host, uri.Port, uri.Segments interface System.IComparable with member this.CompareTo( uri2 ) = compare (elts this) (elts(uri2 :?> ComparableUri)) override this.Equals(uri2) = compare this (uri2 :?> ComparableUri ) = 0 override this.GetHashCode() = 0 ///////////////////////////////////////////////Funtions to retreive html string////////////////////////////// let mutable error = Set.empty<ComparableUri> let mutable visited = Set.empty<ComparableUri> let getHtmlPrimitiveAsyncDelay (delay:int) (uri : ComparableUri) = async{ try let req = (WebRequest.Create(uri)) :?> HttpWebRequest // 'use' is equivalent to ‘using’ in C# for an IDisposable req.UserAgent<-"Mozilla" //Console.WriteLine("Waiting") do! Async.Sleep(delay * 250) let! resp = (req.AsyncGetResponse()) Console.WriteLine(uri.AbsoluteUri+" got response after delay "+string delay) use stream = resp.GetResponseStream() use reader = new StreamReader(stream) let html = reader.ReadToEnd() return html with | _ as ex -> Console.WriteLine( ex.ToString() ) lock error (fun () -> error<- error.Add uri ) lock visited (fun () -> visited<-visited.Add uri ) return "BadUri" } ///////////////////////////////////////////////Active Pattern Matching to retreive href////////////////////////////// let (|Matches|_|) (pat:string) (inp:string) = let m = Regex.Matches(inp, pat) // Note the List.tl, since the first group is always the entirety of the matched string. if m.Count > 0 then Some (List.tail [ for g in m -> g.Value ]) else None let (|Match|_|) (pat:string) (inp:string) = let m = Regex.Match(inp, pat) // Note the List.tl, since the first group is always the entirety of the matched string. if m.Success then Some (List.tail [ for g in m.Groups -> g.Value ]) else None ///////////////////////////////////////////////Find Bad href////////////////////////////// let isEmail (link:string) = link.Contains("@") let isMailto (link:string) = if Seq.length link >=6 then link.[0..5] = "mailto" else false let isJavascript (link:string) = if Seq.length link >=10 then link.[0..9] = "javascript" else false let isBadUri (link:string) = link="BadUri" let isEmptyHttp (link:string) = link="http://" let isFile (link:string)= if Seq.length link >=6 then link.[0..5] = "file:/" else false let containsPipe (link:string) = link.Contains("|") let isAdLink (link:string) = if Seq.length link >=6 then link.[0..5] = "adlink" elif Seq.length link >=9 then link.[0..8] = "http://adLink" else false ///////////////////////////////////////////////Find Bad href////////////////////////////// let getHref (htmlString:string) = let urlPat = "href=\"([^\"]+)" match htmlString with | Matches urlPat urls -> urls |> List.map( fun href -> match href with | Match (urlPat) (link::[]) -> link | _ -> failwith "The href was not in correct format, there was more than one match" ) | _ -> Console.WriteLine( "No links for this page" );[] |> List.filter( fun link -> not(isEmail link) ) |> List.filter( fun link -> not(isMailto link) ) |> List.filter( fun link -> not(isJavascript link) ) |> List.filter( fun link -> not(isBadUri link) ) |> List.filter( fun link -> not(isEmptyHttp link) ) |> List.filter( fun link -> not(isFile link) ) |> List.filter( fun link -> not(containsPipe link) ) |> List.filter( fun link -> not(isAdLink link) ) let treatAjax (href:System.Uri) = let link = href.ToString() let firstPart = (link.Split([|"#"|],System.StringSplitOptions.None)).[0] new Uri(firstPart) //only follow pages with certain extnsion or ones with no exensions let followHref (href:System.Uri) = let valid2 = set[".py"] let valid3 = set[".php";".htm";".asp"] let valid4 = set[".php3";".php4";".php5";".html";".aspx"] let arrLength = href.Segments |> Array.length let lastExtension = (href.Segments).[arrLength-1] let lengthLastExtension = Seq.length lastExtension if (lengthLastExtension <= 3) then not( lastExtension.Contains(".") ) else //test for the 2 case let last4 = lastExtension.[(lengthLastExtension-1)-3..(lengthLastExtension-1)] let isValid2 = valid2|>Seq.exists(fun validEnd -> last4.EndsWith( validEnd) ) if isValid2 then true else if lengthLastExtension <= 4 then not( last4.Contains(".") ) else let last5 = lastExtension.[(lengthLastExtension-1)-4..(lengthLastExtension-1)] let isValid3 = valid3|>Seq.exists(fun validEnd -> last5.EndsWith( validEnd) ) if isValid3 then true else if lengthLastExtension <= 5 then not( last5.Contains(".") ) else let last6 = lastExtension.[(lengthLastExtension-1)-5..(lengthLastExtension-1)] let isValid4 = valid4|>Seq.exists(fun validEnd -> last6.EndsWith( validEnd) ) if isValid4 then true else not( last6.Contains(".") ) && not(lastExtension.[0..5] = "mailto") //Create the correct links / -> add the homepage , make them a comparabel Uri let hrefLinksToUri ( uri:ComparableUri ) (hrefLinks:string list) = hrefLinks |> List.map( fun link -> try if Seq.length link <4 then Some(new Uri( uri, link )) else if link.[0..3] = "http" then Some(new Uri(link)) else Some(new Uri( uri, link )) with | _ as ex -> Console.WriteLine(link); lock error (fun () ->error<-error.Add uri) None ) |> List.filter( fun link -> link.IsSome ) |> List.map( fun o -> o.Value) |> List.map( fun uri -> new ComparableUri( string uri ) ) //Treat uri , removing ajax last part , and only following links specified b Benoit let linksToFollow (hrefUris:ComparableUri list) = hrefUris |>List.map( treatAjax ) |>List.filter( fun link -> followHref link ) |>List.map( fun uri -> new ComparableUri( string uri ) ) |>Set.ofList let needToVisit uri = ( lock visited (fun () -> not( visited.Contains uri) ) ) && (lock error (fun () -> not( error.Contains uri) )) let getLinksToFollowAsyncDelay (delay:int) ( uri: ComparableUri ) = async{ let! links = getHtmlPrimitiveAsyncDelay delay uri lock visited (fun () ->visited<-visited.Add uri) let linksToFollow = getHref links |> hrefLinksToUri uri |> linksToFollow |> Set.filter( needToVisit ) |> Set.map( fun link -> if uri.Authority=link.Authority then link else link ) return linksToFollow } //Add delays if visitng same authority let getDelay(uri:ComparableUri) (authorityDelay:Dictionary<string,int>) = let uriAuthority = uri.Authority let hasAuthority,delay = authorityDelay.TryGetValue(uriAuthority) if hasAuthority then authorityDelay.[uriAuthority] <-delay+1 delay else authorityDelay.Add(uriAuthority,1) 0 let rec getLinksToFollowFromSetAsync maxIteration ( uris: seq<ComparableUri> ) = let authorityDelay = Dictionary<string,int>() if maxIteration = 100 then Console.WriteLine("Finished") else //Unite by authority add delay for those we same authority others ignore let stopwatch= System.Diagnostics.Stopwatch() stopwatch.Start() let newLinks = uris |> Seq.map( fun uri -> let delay = lock authorityDelay (fun () -> getDelay uri authorityDelay ) getLinksToFollowAsyncDelay delay uri ) |> Async.Parallel |> Async.RunSynchronously |> Seq.concat stopwatch.Stop() Console.WriteLine("\n\n\n\n\n\n\nTimeElapse : "+string stopwatch.Elapsed+"\n\n\n\n\n\n\n\n\n") getLinksToFollowFromSetAsync (maxIteration+1) newLinks getLinksToFollowFromSetAsync 0 (seq[ComparableUri( "http://twitter.com/" )]) Console.WriteLine("Finished") Some feedBack would be great ! Thank you (note this is just something I am doing for fun)

Search Results

Search found 24 results on 1 pages for 'webcrawling'.

Page 1/1 | 1

- by jlezard

- by Jojo

- by Bigtwinz

- by Sakin

- by OverTheRainbow

- by DrDee

- by AlgoMan

- by Skoder

- by jlezard

- by Gautam

- by Joshua Frank

- by DrDee

- by Glenn Slaven

- by Michael Robinson

- by jlezard

- by Harry

- by goh

- by zahir hussain

- by Alex

- by gautam

- by bohohasdhfasdf

- by Richard

- by sirrocco

- by Anonymity