So, after I posted that piece last night about wanting a set of libraries to make writing spidering apps in C# (or VB.NET) as quick and easy as it is in Perl, I started coding. I'll set up a full GotDotNet Workspace a little later with all the code nicely bundled up with its full set of unit tests, but I've got to share this here.
The idea was to be able to write a very short program to download all the videos I've not got yet from Channel 9's website. The final app looks like this:

using System;
using System.IO;
using DotNetScriptLib.Web;
using DotNetScriptLib.Mail;
namespace DownloadChannel9Videos
{
    /// <summary>
    /// Console app that downloads every Channel 9 video not already present
    /// in the user's My Pictures folder, using DotNetScriptLib.
    /// </summary>
    class Class1
    {
        [STAThread]
        static void Main(string[] args)
        {
            // Tracks whether anything new was fetched; reserved for the
            // planned "email me when new videos arrive" feature.
            bool newVids = false;

            string path = Environment.GetFolderPath(System.Environment.SpecialFolder.MyPictures);
            WebPage channel9 = new WebPage("http://channel9.msdn.com/ShowForum.aspx?ForumID=14");

            // Every downloadable video post on the forum page carries a "[Save]" link.
            foreach (WebPageAnchor videoLink in channel9.GrabAnchorsByContent("[Save]"))
            {
                // Path.Combine instead of hand-rolled "path + \"\\\\\" + name"
                // separator concatenation.
                string target = Path.Combine(path, videoLink.FileName);
                if (!File.Exists(target))
                {
                    channel9.SaveToDisk(videoLink, target);
                    newVids = true;
                }
            }
        }
    }
}
As you can see, I succeeded in keeping it small. When you take into account that Visual Studio generates the class def, the Main() method signature and other stuff, my app only takes 7 lines of code to hit the site and grab everything I've not already got. In fact, it only really uses 5 lines of code since two of the lines in there apply to updating a boolean newVids flag that I'm not currently using. That's actually going to come into play when I get DotNetScript working with email support; if new videos are downloaded, an email will be sent to me telling me so.
DotNetScriptLib at the moment is very very small. It just has capabilities to download a web page, interrogate and locate links in that page, and download files. The idea now though is that as soon as I get the inkling to fire up Perl to do something, I'll add to DotNetScriptLib instead. If all goes to plan I'll eventually end up with a set of libraries that turn web spidering, file trawling and other goodies into fantastically trivial C# or VB apps.
Here are the two other main classes in the app (they are of course in their own assembly, and yes, they do have unit tests — but you'll have to wait until I get my arse in gear and set up a workspace before you can see all that good stuff).
Incidentally, before I dump a bunch of code in here — has anyone got any bright ideas on how to write a unit test to verify that a mail got sent?
Here's the code:
WebPage.cs:

using System;
using System.Net;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
namespace DotNetScriptLib.Web
{
    /// <summary>
    /// Downloads a web page at construction time and exposes simple
    /// anchor-scraping helpers over its HTML source.
    /// </summary>
    public class WebPage
    {
        // Lazily-populated cache of WebPageAnchor instances parsed from Source.
        private ArrayList _links = null;

        // Raw HTML of the downloaded page.
        public string Source;

        /// <summary>Downloads the page at <paramref name="url"/> immediately.</summary>
        public WebPage( string url )
        {
            DownloadWebPage(url);
        }

        /// <summary>
        /// Returns every anchor whose inner text exactly equals
        /// <paramref name="content"/>. Always returns an array (empty when
        /// nothing matches, never null) so callers can foreach safely.
        /// </summary>
        public Object[] GrabAnchorsByContent(string content )
        {
            ExtractLinks();
            ArrayList results = new ArrayList();
            if ( _links != null )
            {
                foreach (WebPageAnchor link in _links )
                {
                    if ( link.Content == content )
                        results.Add(link);
                }
            }
            return results.ToArray();
        }

        /// <summary>
        /// Downloads the target of <paramref name="link"/> to the local path
        /// <paramref name="filename"/>.
        /// </summary>
        public void SaveToDisk( WebPageAnchor link, string filename )
        {
            WebClient client = new WebClient();
            client.DownloadFile( link.URL, filename);
        }

        // Parses all <a ...>...</a> tags out of Source into _links.
        // Idempotent: runs the regex only once per instance.
        private void ExtractLinks()
        {
            if ( _links != null )
                return; // already parsed; Source never changes after construction.

            // Group "1" captures the href target, group "2" the anchor's inner
            // text — matching the Groups["1"]/Groups["2"] consumption below.
            // (The pattern as originally published was mangled by HTML
            // escaping; this is a working reconstruction.)
            string regex =
                "<a\\s+[^>]*?href\\s*=\\s*[\"']?(?<1>[^\"'\\s>]+)[\"']?[^>]*>(?<2>[^<]+|.*?)?</a>";
            RegexOptions options =
                RegexOptions.IgnorePatternWhitespace |
                RegexOptions.Multiline |
                RegexOptions.IgnoreCase;
            Regex reg = new Regex(regex, options);
            MatchCollection matches = reg.Matches(this.Source);

            // An empty list (rather than null) caches the "no anchors" result
            // too, so a page with no links isn't re-parsed on every call.
            _links = new ArrayList();
            foreach (Match match in matches )
            {
                WebPageAnchor anchor = new WebPageAnchor(match.Groups["1"].Value, match.Groups["2"].Value);
                _links.Add( anchor );
            }
        }

        // Fetches the page body into Source.
        private void DownloadWebPage( string url )
        {
            WebRequest req = WebRequest.Create(url);
            // Dispose the response as well as the reader so the underlying
            // connection is released promptly.
            using (WebResponse resp = req.GetResponse())
            using (StreamReader sr = new StreamReader( resp.GetResponseStream() ))
            {
                this.Source = sr.ReadToEnd();
            }
        }
    }
}
WebPageAnchor.cs:

using System;
using System.Text.RegularExpressions;
namespace DotNetScriptLib.Web
{
    /// <summary>
    /// A single anchor tag scraped from a page: its target URL and its
    /// inner text.
    /// </summary>
    public class WebPageAnchor
    {
        private string _url;
        private string _content;

        /// <summary>The anchor's href target.</summary>
        public string URL { get { return _url; } }

        /// <summary>The anchor's inner text.</summary>
        public string Content { get { return _content; } }

        /// <summary>
        /// The file-name portion of the URL (everything after the last '/'),
        /// provided it ends in a dot followed by an alphanumeric extension;
        /// otherwise the empty string.
        /// </summary>
        public string FileName
        {
            get
            {
                string result = "";
                if ( _url.Length != 0 )
                {
                    // Anchored with '$' instead of a literal "\r\n" (which a
                    // URL never contains, so the old pattern could never
                    // match), and the extension length is [a-zA-Z0-9]+ rather
                    // than exactly three characters, so ".mpeg" etc. work too.
                    string regex = ".*/(.*[.][a-zA-Z0-9]+)$";
                    RegexOptions options =
                        RegexOptions.IgnorePatternWhitespace |
                        RegexOptions.Multiline |
                        RegexOptions.IgnoreCase;
                    Regex reg = new Regex(regex, options);
                    Match match = reg.Match(_url);
                    if ( match.Success )
                        result = match.Groups[1].Value;
                }
                return result;
            }
        }

        // Internal: anchors are only created by WebPage's link extraction.
        internal WebPageAnchor( string url, string content )
        {
            _url = url;
            _content = content;
        }
    }
}
9:17:08 PM
|