Data extract from website URL

Posted by user2522395 on Stack Overflow See other posts from Stack Overflow or by user2522395
Published on 2013-06-26T03:25:53Z Indexed on 2013/07/01 4:21 UTC
Read the original article Hit count: 121

Filed under:
|
|
|

With the script below I am able to extract all the links of a particular website, but I need to know how I can extract data from the pages behind those links, especially e-mail addresses and phone numbers, if they are present. Please help me modify the existing script to get that result, or, if you have a full sample script, please provide it.

Private Sub btnGo_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles btnGo.Click
    'Click handler for btnGo: crawls the hard-coded site one level deep
    'and lists every url that Spider returns in lstUrls.
    'The start url must be in this format: http://www.example.com/
    Dim foundUrls As ArrayList = Spider("http://www.qatarliving.com", 1)
    'copy the results into the list box in the order they were found
    For index As Integer = 0 To foundUrls.Count - 1
        lstUrls.Items.Add(CStr(foundUrls(index)))
    Next
End Sub
Private Function Spider(ByVal url As String, ByVal depth As Integer) As ArrayList
    'Collects urls level by level, starting from the links found on url.
    '  url   - page whose links seed the crawl
    '  depth - number of extra levels to follow; anything below 1 returns
    '          only the links found on the start page
    'Returns every url found, in discovery order.

    'links of the level currently being expanded (starts with the first page)
    Dim frontier As ArrayList = GrabUrls(url)
    'everything found so far; this is the list handed back to the caller
    Dim collected As New ArrayList
    collected.AddRange(frontier)

    'nothing more to follow when no extra levels were requested
    If depth < 1 Then
        Return collected
    End If

    Dim level As Integer = 1
    While level <= depth
        'urls first seen on this level
        Dim discovered As New ArrayList
        For Each pageUrl As String In frontier
            'the three-argument overload drops urls that are already in
            'collected or discovered, so only unseen urls are appended
            Dim fresh As ArrayList = GrabUrls(pageUrl, collected, discovered)
            discovered.AddRange(fresh)
        Next
        'record this level's urls and expand them on the next pass
        collected.AddRange(discovered)
        frontier = discovered
        level += 1
    End While

    Return collected
End Function
Private Overloads Function GrabUrls(ByVal url As String) As ArrayList
    'Downloads the page at url and returns the distinct href targets of its
    'anchor tags, in the order they appear in the source.
    '  url - absolute address of the page to download
    'Returns an ArrayList of strings. The function is best-effort: if the
    'page cannot be downloaded or url is not a valid absolute uri, the urls
    'gathered so far (possibly none) are returned and the failure is written
    'to the debug output instead of being thrown to the caller.

    'will hold the urls to be returned
    Dim aReturn As New ArrayList
    Try
        'matches <a ... href="..." ...>...</a>; group 1 is the href value
        Dim strRegex As String = "<a.*?href=""(.*?)"".*?>(.*?)</a>"
        'put the source into a string; WebClient is disposable, so release
        'it as soon as the download has finished
        Dim strSource As String
        Using wc As New WebClient
            strSource = wc.DownloadString(url)
        End Using
        Dim HrefRegex As New Regex(strRegex, RegexOptions.IgnoreCase Or RegexOptions.Compiled)
        'used to resolve hrefs that carry no domain of their own
        Dim BaseUrl As New Uri(url)
        'walk through every anchor found in the source
        Dim HrefMatch As Match = HrefRegex.Match(strSource)
        While HrefMatch.Success
            Dim sUrl As String = HrefMatch.Groups(1).Value
            Dim usable As Boolean = True
            'if it's a page or sub directory with no base url (domain)
            If Not sUrl.Contains("http://") AndAlso Not sUrl.Contains("www") Then
                'add the domain plus the page; TryCreate is used so that one
                'malformed href is skipped instead of throwing and discarding
                'every link that follows it on the page
                Dim tUri As Uri = Nothing
                If Uri.TryCreate(BaseUrl, sUrl, tUri) Then
                    sUrl = tUri.ToString
                Else
                    usable = False
                End If
            End If
            'if it's not already in the list then add it
            If usable AndAlso Not aReturn.Contains(sUrl) Then aReturn.Add(sUrl)
            'go to the next url
            HrefMatch = HrefMatch.NextMatch
        End While
    Catch ex As Exception
        'deliberately best-effort: a page that fails must not stop the crawl,
        'but leave a trace so the failure is visible while debugging
        System.Diagnostics.Debug.WriteLine("GrabUrls failed for '" & url & "': " & ex.Message)
    End Try

    Return aReturn
End Function
Private Overloads Function GrabUrls(ByVal url As String, ByRef aReturn As ArrayList, ByRef aNew As ArrayList) As ArrayList
    'Overload that filters out urls which are already known.
    '  url     - page to pull the links from
    '  aReturn - urls that have already been collected
    '  aNew    - urls already queued for the next level
    'Returns only the links of url that appear in neither list, so that the
    'caller does not fetch the same url again. Neither list is modified here.
    Dim unseen As New ArrayList
    For Each candidate As String In GrabUrls(url)
        'skip anything the caller already has in either list
        If aReturn.Contains(candidate) OrElse aNew.Contains(candidate) Then
            Continue For
        End If
        unseen.Add(candidate)
    Next
    Return unseen
End Function

© Stack Overflow or respective owner

Related posts about c#

Related posts about vb.net