Sunday, August 05, 2007

HTML Parser for VBA

'I successfully ran this program using Microsoft Excel to
'parse some html files and extract information from them
'into a text file

Option Explicit

'This program parses through html files
'and copies only the information of interest to a new file
Sub EditTheFiles()
Dim FSO As FileSystemObject
'need reference to Microsoft Scripting Runtime
Dim SourceFolder As Scripting.Folder, SubFolder As Scripting.Folder
Dim FileItem As Scripting.file
Dim SourceFolderName As String
SourceFolderName = "C:\Folder\SubFolder"
'SourceFolderName = BrowseFolder("Select A Folder")
Dim strLineParse As String
Dim bolKeepTheLine As Boolean
Dim strGet As String
Dim i As Long
Dim j As Long
Dim dblFileLength As Double
'a running variable that counts the number of bytes
'we've parsed; use double because it might get big

Set FSO = New Scripting.FileSystemObject
Set SourceFolder = FSO.GetFolder(SourceFolderName)

On Error Resume Next
Kill (SourceFolderName & "\Output_File.txt") 'start afresh

Open SourceFolderName & "\Output_File.txt" For Output As #2
'we only use one file to hold all the important information
'from all the files in the directory
For Each FileItem In SourceFolder.Files
'it seems to order the files by date/time
'with earliest first
If Right(FileItem.Name, 5) = ".html" Then

Open FileItem.Path For Binary As #1
'we use binary for the "Get" statement

'some initilizations
bolKeepTheLine = False
'by default, don't add the lines
'to our new file
dblFileLength = 0 'byte number
i = 0 'line number
j = 0
strLineParse = ""

While dblFileLength < LOF(1)
'LOF gives the size of the file in bytes
strLineParse = ""
'start it off with nothing so the first
'character will concatenate correctly
strGet = "A"
'dummy value to start the loop

'the html files we are parsing don't have 0D 0A
'at the end of the lines so we can't use the
'Line Input statment;
'they only have 0A at the ends, so we have to
'make our own "Line Input" statement
While Asc(strGet) <> 10 And dblFileLength < LOF(1)
'added the 2nd condition to make sure
'the loop ends
Get #1, , strGet
'runs through the file byte by byte
dblFileLength = dblFileLength + 1
strLineParse = strLineParse & strGet
Wend
strLineParse = Left(strLineParse, _
Len(strLineParse) - 1)
'take out the last LF character
i = i + 1

'in our case, the stuff we want to keep
'is two lines below this text
'so if we find this in the old file,
'go ahead and edit the new file
If strLineParse = "BEGIN TEXT" Then
'marks the beginning of our information
j = i + 2
'we don't start keeping the lines till two
'lines down
Print #2, ""
Print #2, ""
Print #2, Left(FileItem.Name, _
Len(FileItem.Name) - 5)
Print #2, ""
End If

If i = j Then
bolKeepTheLine = True
End If

If strLineParse = "END TEXT" Then
'marks the end of our information
bolKeepTheLine = False
End If

If bolKeepTheLine Then
Print #2, strLineParse
End If
Wend 'get next line in file
Close #1
End If
Next FileItem 'get the next file in the source directory
Close #2

'housekeeping
Set FileItem = Nothing
Set SourceFolder = Nothing
Set FSO = Nothing

End Sub

Labels:

5 Comments:

Blogger mathfridge said...

I just figured I better increase the average on comments posted. Oh a good job noticing the futurama quote in my second blog.

9/05/2007 10:44:00 PM  
Blogger Nic said...

I believe that you want to use type "Long" for "dblFileLength" rather than "Double". That is, you want a long integer, not a double-precision floating point number.

9/27/2007 10:41:00 PM  
Blogger jigawatt said...

Yeah, a long would probably be sufficient here. If I was using this to work on files bigger than 2GB, I'd need to rewrite it in something other than VBA.

9/28/2007 07:19:00 AM  
Blogger Alison said...

Excellent script. Worked for me.

Thanks

11/28/2008 02:35:00 AM  
Blogger jigawatt said...

Alison,

I'd forgotton about this post. Glad that it's helped you out.

Maybe I'll do some more of these kinds of posts.

11/28/2008 09:58:00 AM  

Post a Comment

<< Home