Scraper – Custom .NET Class for HTML/WebBrowser Document Scraping and Automation
Hey All,
Noticed a lot of threads on forums lately asking how to “find hyperlinks” or “click a form button on a web page”. I’ve been working on a class to make this easier, not only for myself but hopefully for others as well. While it’s still very humble in its functionality at the moment, I’m continuing to add to it to increase functionality and ease of use.
I’m very open to suggestions on speed increases, efficiency increases or any other comments you may have.
This is something I use all the time and I know it saves me a lot of time. Hopefully others can benefit from it as well.
Screenshot of it in use and Source code (also .vb file) below:
Download the .vb Class File: Scraper.zip | Scraper Demo
''' <summary>
''' Scraper: helper methods for reading elements out of the page currently loaded in a
''' WebBrowser control and for driving its inputs and buttons.
''' Coded by: Steve Hanz of www.stateofidleness.com
''' Date: 12-17-2010
''' </summary>
''' <remarks>
''' Free to use for private or commercial use. I wouldn't turn down a link to my site though *wink*
''' All methods treat a WebBrowser that has no document loaded (for example when called right
''' after Navigate, before DocumentCompleted has fired) as a page with no elements: list
''' methods return an empty list, searches return False and setters do nothing.
''' </remarks>
Public Class Scraper

    ''' <summary>
    ''' Returns the document currently loaded in the browser, or Nothing when there is no
    ''' browser or no page has been loaded yet.
    ''' </summary>
    Private Shared Function GetLoadedDocument(ByVal wb As WebBrowser) As HtmlDocument
        If wb Is Nothing Then
            Return Nothing
        End If
        Return wb.Document
    End Function

    ''' <summary>
    ''' Returns the element's InnerText, or the placeholder "BLANK" when it has none.
    ''' </summary>
    Private Shared Function InnerTextOrBlank(ByVal source As HtmlElement) As String
        Dim text As String = source.InnerText
        If String.IsNullOrEmpty(text) Then
            Return "BLANK"
        End If
        Return text
    End Function

    ''' <summary>
    ''' Returns True when the element's "id" or "name" attribute equals the supplied key.
    ''' </summary>
    Private Shared Function MatchesIdOrName(ByVal candidate As HtmlElement, ByVal nameOrId As String) As Boolean
        ' An empty key must never match: elements without an id/name report an empty
        ' attribute value, so comparing against "" would select nearly every element.
        If String.IsNullOrEmpty(nameOrId) Then
            Return False
        End If
        Return candidate.GetAttribute("id") = nameOrId OrElse candidate.GetAttribute("name") = nameOrId
    End Function

    ''' <summary>
    ''' Returns a List(Of String) describing the elements with the given tag that pass the filter.
    ''' </summary>
    ''' <param name="wb">A WebBrowser Object to search</param>
    ''' <param name="element">The element tag to search for ("input", "a", "img" etc)</param>
    ''' <param name="AttributeFilter">The element attribute whose value is tested against TextFilter ("class", "name", "id", etc)</param>
    ''' <param name="TextFilter">Text that must occur (case-insensitively) in the AttributeFilter attribute; Nothing or empty disables filtering</param>
    ''' <param name="ReturnedAttributeFilter">When supplied, only the value of this attribute is returned for each matching element (elements without it are skipped)</param>
    ''' <param name="ReturnInnerText">When True and no ReturnedAttributeFilter is supplied, each element's OuterHtml is followed by its InnerText ("BLANK" when empty)</param>
    ''' <returns>
    ''' One entry per matching element (two when ReturnInnerText applies). Empty when nothing
    ''' matches, when element is empty, or when no document is loaded.
    ''' </returns>
    ''' <remarks></remarks>
    Public Function ListElements( _
        ByVal wb As WebBrowser, _
        ByVal element As String, _
        Optional ByVal AttributeFilter As String = "id", _
        Optional ByVal TextFilter As String = Nothing, _
        Optional ByVal ReturnedAttributeFilter As String = Nothing, _
        Optional ByVal ReturnInnerText As Boolean = False) As List(Of String)

        Dim ScrapedData As New List(Of String)

        Dim doc As HtmlDocument = GetLoadedDocument(wb)
        If doc Is Nothing OrElse String.IsNullOrEmpty(element) Then
            Return ScrapedData
        End If

        Dim hasTextFilter As Boolean = Not String.IsNullOrEmpty(TextFilter)
        Dim returnAttributeOnly As Boolean = Not String.IsNullOrEmpty(ReturnedAttributeFilter)

        For Each curElement As HtmlElement In doc.GetElementsByTagName(element)
            If hasTextFilter Then
                ' Compare case-insensitively on both sides; lower-casing only the attribute
                ' value would make any filter containing an upper-case letter unmatchable.
                Dim filterValue As String = curElement.GetAttribute(AttributeFilter)
                If String.IsNullOrEmpty(filterValue) OrElse _
                   filterValue.IndexOf(TextFilter, StringComparison.OrdinalIgnoreCase) < 0 Then
                    Continue For
                End If
            End If

            If returnAttributeOnly Then
                ' Caller asked for one attribute only; skip elements that do not carry it.
                Dim requested As String = curElement.GetAttribute(ReturnedAttributeFilter)
                If Not String.IsNullOrEmpty(requested) Then
                    ScrapedData.Add(requested)
                End If
            Else
                ScrapedData.Add(curElement.OuterHtml) 'Give them everything
                If ReturnInnerText Then
                    ScrapedData.Add(InnerTextOrBlank(curElement)) 'Give them the actual text if they wanted it
                End If
            End If
        Next

        Return ScrapedData
    End Function

    ''' <summary>
    ''' Returns a Boolean indicating whether any of the keywords occurs in a paragraph of the current document.
    ''' </summary>
    ''' <param name="wb">A WebBrowser Object to search</param>
    ''' <param name="Keywords">A List(Of String) of keyword(s) to search for in the current Document</param>
    ''' <returns>True as soon as one keyword is found; otherwise False.</returns>
    ''' <remarks>
    ''' Only &lt;p&gt; elements inside the document body that have visible text are examined.
    ''' The match is case-sensitive and is made against each paragraph's OuterHtml, so text
    ''' inside the paragraph's markup (tags, attributes) can match as well.
    ''' Nothing/empty keywords are ignored.
    ''' </remarks>
    Public Function FindKeywords( _
        ByVal wb As WebBrowser, _
        ByVal Keywords As List(Of String)) As Boolean

        If Keywords Is Nothing OrElse Keywords.Count = 0 Then
            Return False
        End If

        Dim doc As HtmlDocument = GetLoadedDocument(wb)
        If doc Is Nothing OrElse doc.Body Is Nothing Then
            Return False
        End If

        For Each curElement As HtmlElement In doc.Body.GetElementsByTagName("p")
            ' Paragraphs without any text are skipped; the check does not depend on the
            ' keyword, so it is done once per paragraph.
            If String.IsNullOrEmpty(curElement.OuterText) Then
                Continue For
            End If

            Dim markup As String = curElement.OuterHtml
            If String.IsNullOrEmpty(markup) Then
                Continue For
            End If

            For Each item As String In Keywords
                ' An empty keyword would trivially "match" every paragraph, so skip it.
                If Not String.IsNullOrEmpty(item) AndAlso markup.Contains(item) Then
                    Return True
                End If
            Next
        Next

        Return False
    End Function

    ''' <summary>
    ''' Set the text of an Input element in the current document.
    ''' </summary>
    ''' <param name="wb">A WebBrowser object to search</param>
    ''' <param name="InputNameOrID">The ID or Name of the element to alter</param>
    ''' <param name="Value">The new value to set for the element</param>
    ''' <remarks>
    ''' Every element in the document whose id or name equals InputNameOrID gets its "value"
    ''' attribute set. Nothing happens when InputNameOrID is empty or no document is loaded.
    ''' </remarks>
    Public Sub SetInputText( _
        ByVal wb As WebBrowser, _
        ByVal InputNameOrID As String, _
        ByVal Value As String)

        Dim doc As HtmlDocument = GetLoadedDocument(wb)
        If doc Is Nothing Then
            Return
        End If

        For Each curElement As HtmlElement In doc.All
            If MatchesIdOrName(curElement, InputNameOrID) Then
                curElement.SetAttribute("value", Value)
            End If
        Next
    End Sub

    ''' <summary>
    ''' Set the text of a Textarea element in the current document.
    ''' </summary>
    ''' <param name="wb">A WebBrowser object to search</param>
    ''' <param name="InputNameOrID">The ID or Name of the element to alter</param>
    ''' <param name="Value">The new value to set for the element</param>
    ''' <remarks>
    ''' Every element in the document whose id or name equals InputNameOrID gets its
    ''' InnerText replaced. Nothing happens when InputNameOrID is empty or no document is loaded.
    ''' </remarks>
    Public Sub SetTextareaText( _
        ByVal wb As WebBrowser, _
        ByVal InputNameOrID As String, _
        ByVal Value As String)

        Dim doc As HtmlDocument = GetLoadedDocument(wb)
        If doc Is Nothing Then
            Return
        End If

        For Each curElement As HtmlElement In doc.All
            If MatchesIdOrName(curElement, InputNameOrID) Then
                curElement.InnerText = Value
            End If
        Next
    End Sub

    ''' <summary>
    ''' Invoke action on a Button element in the current document.
    ''' </summary>
    ''' <param name="wb">A WebBrowser object to search</param>
    ''' <param name="InputNameOrID">The ID or Name of the element to alter</param>
    ''' <param name="EventToTrigger">The member to invoke on each matching element</param>
    ''' <remarks>
    ''' EventToTrigger examples: "click", "onfocus", "onclick", etc.
    ''' Both &lt;input&gt; and &lt;button&gt; elements are searched; every element whose id or
    ''' name equals InputNameOrID is invoked. Nothing happens when InputNameOrID or
    ''' EventToTrigger is empty, or when no document is loaded.
    ''' </remarks>
    Public Sub ClickButton( _
        ByVal wb As WebBrowser, _
        ByVal InputNameOrID As String, _
        ByVal EventToTrigger As String)

        Dim doc As HtmlDocument = GetLoadedDocument(wb)
        If doc Is Nothing OrElse String.IsNullOrEmpty(EventToTrigger) Then
            Return
        End If

        ' Buttons can be written either as <input type="button|submit|..."> or as <button>.
        For Each tagName As String In New String() {"input", "button"}
            For Each curElement As HtmlElement In doc.GetElementsByTagName(tagName)
                If MatchesIdOrName(curElement, InputNameOrID) Then
                    curElement.InvokeMember(EventToTrigger)
                End If
            Next
        Next
    End Sub

End Class
Comments Welcome.

I like this post, thank you…
I am trying to use your class, but I have been getting this NULL exception at runtime. Could you please point out where I am going wrong?
My code :
' Click handler for the "Go" button: navigates WebBrowser1 to the URL typed into tb_URL,
' then lists every "img" element via Scraper.ListElements and appends each result to RichTextBox1.
' NOTE(review): the scraping below runs immediately after Navigate is called. Per the reply that
' follows this snippet, the page has not finished loading at that point, which is the likely
' source of the reported null reference; the scraping belongs in the DocumentCompleted handler.
Private Sub Btn_Go_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles Btn_Go.Click
WebBrowser1.Navigate(New Uri(tb_URL.Text))
Dim sc As Scraper
sc = New Scraper()
' i is declared as a String and later reused as the For Each loop variable.
Dim i As String
Dim list As List(Of String)
list = sc.ListElements(WebBrowser1, “img”)
' NOTE(review): assigns the Integer 0 to the String variable i; the value is overwritten by
' the loop below, so this line appears to serve no purpose — confirm and remove.
i = 0
For Each i In list
RichTextBox1.AppendText(i)
Next
End Sub
What’s most likely happening is that you’re attempting to perform the scraping immediately after the .Navigate call. The page hasn’t fully loaded (or doesn’t even exist yet), which is why, when you try to scrape it, it says there’s no document to scrape (null reference).
You might also modify the loop to indicate that each “i” is a String.
For Each i As String In List
Try placing your scraping code (the lines after the .Navigate call) in the WebBrowser’s DocumentCompleted event. You might also check out my post about detecting a fully loaded page, as you’ll most likely run into the same problem I talk about in that post.
Thanks for trying it out!
Trying out your Scraper demo. The Scrape Link InnerText button causes a hangup.
VB2010, Win 7 64