Imports System.IO
Imports System.Net
Imports System.Text.RegularExpressions
Public Class Form1
Private Sub btnGo_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles Button1.Click
'url must be in this format: http://www.example.com/
Dim blist As ArrayList = Spider("http://www.bing.com/search?q=" + TextBox1.Text + "&first=11&FORM=PERE", 0)
Dim clist As ArrayList = Spider("http://www.bing.com/search?q=" + TextBox1.Text + "&first=21&FORM=PERE", 0)
Dim dlist As ArrayList = Spider("http://www.bing.com/search?q=" + TextBox1.Text + "&first=31&FORM=PERE", 0)
Dim elist As ArrayList = Spider("http://www.bing.com/search?q=" + TextBox1.Text + "&first=41&FORM=PERE", 0)
Dim flist As ArrayList = Spider("http://www.bing.com/search?q=" + TextBox1.Text + "&first=51&FORM=PERE", 0)
Dim glist As ArrayList = Spider("http://www.bing.com/search?q=" + TextBox1.Text + "&first=61&FORM=PERE", 0)
Dim hlist As ArrayList = Spider("http://www.bing.com/search?q=" + TextBox1.Text + "&first=71&FORM=PERE", 0)
Dim ilist As ArrayList = Spider("http://www.bing.com/search?q=" + TextBox1.Text + "&first=81&FORM=PERE", 0)
Dim jlist As ArrayList = Spider("http://www.bing.com/search?q=" + TextBox1.Text + "&first=91&FORM=PERE", 0)
Dim klist As ArrayList = Spider("http://www.bing.com/search?q=" + TextBox1.Text + "&first=101&FORM=PERE", 0)
Dim llist As ArrayList = Spider("http://www.bing.com/search?q=" + TextBox1.Text + "&first=111&FORM=PERE", 0)
Dim mlist As ArrayList = Spider("http://www.bing.com/search?q=" + TextBox1.Text + "&first=121&FORM=PERE", 0)
Dim nlist As ArrayList = Spider("http://www.bing.com/search?q=" + TextBox1.Text + "&first=131&FORM=PERE", 0)
Dim olist As ArrayList = Spider("http://www.bing.com/search?q=" + TextBox1.Text + "&first=141&FORM=PERE", 0)
Dim plist As ArrayList = Spider("http://www.bing.com/search?q=" + TextBox1.Text + "&first=151&FORM=PERE", 0)
Dim aList As ArrayList = Spider("http://www.bing.com/search?q=" + TextBox1.Text, 0)
For Each url As String In blist
ListBox1.Items.Add(url)
Next
For Each url As String In aList
ListBox1.Items.Add(url)
Next
For Each url As String In clist
ListBox1.Items.Add(url)
Next
For Each url As String In dlist
ListBox1.Items.Add(url)
Next
For Each url As String In elist
ListBox1.Items.Add(url)
Next
For Each url As String In dlist
ListBox1.Items.Add(url)
Next
For Each url As String In elist
ListBox1.Items.Add(url)
Next
For Each url As String In flist
ListBox1.Items.Add(url)
Next
For Each url As String In glist
ListBox1.Items.Add(url)
Next
For Each url As String In hlist
ListBox1.Items.Add(url)
Next
For Each url As String In ilist
ListBox1.Items.Add(url)
Next
For Each url As String In jlist
ListBox1.Items.Add(url)
Next
For Each url As String In klist
ListBox1.Items.Add(url)
Next
For Each url As String In llist
ListBox1.Items.Add(url)
Next
For Each url As String In mlist
ListBox1.Items.Add(url)
Next
For Each url As String In nlist
ListBox1.Items.Add(url)
Next
For Each url As String In olist
ListBox1.Items.Add(url)
Next
For Each url As String In plist
ListBox1.Items.Add(url)
Next
End Sub
Private Function Spider(ByVal url As String, ByVal depth As Integer) As ArrayList
'aReturn is used to hold the list of urls
Dim aReturn As New ArrayList
'aStart is used to hold the new urls to be checked
Dim aStart As ArrayList = GrabUrls(url)
'temp array to hold data being passed to new arrays
Dim aTemp As ArrayList
'aNew is used to hold new urls before being passed to aStart
Dim aNew As New ArrayList
'add the first batch of urls
aReturn.AddRange(aStart)
'if depth is 0 then only return 1 page
If depth < 1 Then Return aReturn
'loops through the levels of urls
For i = 1 To depth
'grabs the urls from each url in aStart
For Each tUrl As String In aStart
'grabs the urls and returns non-duplicates
aTemp = GrabUrls(tUrl, aReturn, aNew)
'add the urls to be check to aNew
aNew.AddRange(aTemp)
Next
'swap urls to aStart to be checked
aStart = aNew
'add the urls to the main list
aReturn.AddRange(aNew)
'clear the temp array
aNew = New ArrayList
Next
Return aReturn
End Function
Private Overloads Function GrabUrls(ByVal url As String) As ArrayList
'will hold the urls to be returned
Dim aReturn As New ArrayList
Try
'regex string used: thanks google 'Dim strRegex As String = ".*?<a.*?href=""(.*?)""(.*?)"".*?>(.*?)</a>"' bu kodda yarar iyi buda
Dim strRegex As String = "<h2><a.*?href=""(.*?)"".*?>(.*?)</a></h2"
'i used a webclient to get the source
'web requests might be faster
Dim wc As New WebClient
'put the source into a string
Dim strSource As String = wc.DownloadString(url)
Dim HrefRegex As New Regex(strRegex, RegexOptions.IgnoreCase Or RegexOptions.Compiled)
'parse the urls from the source
Dim HrefMatch As Match = HrefRegex.Match(strSource)
'used later to get the base domain without subdirectories or pages
Dim BaseUrl As New Uri(url)
'while there are urls
While HrefMatch.Success = True
'loop through the matches
Dim sUrl As String = HrefMatch.Groups(1).Value
'if it's a page or sub directory with no base url (domain)
If Not sUrl.Contains("http://") AndAlso Not sUrl.Contains("www") Then
'add the domain plus the page
Dim tURi As New Uri(BaseUrl, sUrl)
sUrl = tURi.ToString
End If
'if it's not already in the list then add it
If Not aReturn.Contains(sUrl) Then aReturn.Add(sUrl)
'go to the next url
HrefMatch = HrefMatch.NextMatch
End While
Catch ex As Exception
'catch ex here. I left it blank while debugging
End Try
Return aReturn
End Function
Private Overloads Function GrabUrls(ByVal url As String, ByRef aReturn As ArrayList, ByRef aNew As ArrayList) As ArrayList
'overloads function to check duplicates in aNew and aReturn
'temp url arraylist
Dim tUrls As ArrayList = GrabUrls(url)
'used to return the list
Dim tReturn As New ArrayList
'check each item to see if it exists, so not to grab the urls again
For Each item As String In tUrls
If Not aReturn.Contains(item) AndAlso Not aNew.Contains(item) Then
tReturn.Add(item)
End If
Next
Return tReturn
End Function
Private Sub Form1_Load(sender As Object, e As EventArgs) Handles MyBase.Load
End Sub
Private Sub Button2_Click(sender As Object, e As EventArgs)
ListBox1.Items.Clear()
End Sub
End Class