[C#][WinForm]如何清除特定論壇網頁"毛毛蟲"
這天MSN那頭傳來,有辦法清除網頁上的"毛毛蟲"嗎??毛毛蟲搞得複製文章很痛苦,心想是在說啥??算了還是看看網站才能明白他在說什麼??
瀏覽網頁看似很正常
瀏覽HTML就會發現"毛毛蟲"
直接複製後,我就可以體會他的痛苦了,看來這是論壇用來防止盜連文章的一種手法
就在看HTML碼同時,Feel大哥又來了~~~動手吧!孩子
快速作法利用webbrowser+.NET RegularExpressions應該就可以過濾那些"毛毛蟲"
為了取得shell的windows須先加入參考(Microsoft Internet Controls)
// Collection of open shell windows, enumerated by GetShellWindow.
SHDocVw.ShellWindows SW = new SHDocVw.ShellWindows();

/// <summary>
/// Adds the current URL of every Internet Explorer window found in
/// <c>SW</c> to <c>listBox1</c>.
/// </summary>
private void GetShellWindow()
{
    foreach (SHDocVw.InternetExplorer ie in SW)
    {
        string hostPath = ie.FullName;

        // Keep only windows hosted by IE. The comparison is case-insensitive
        // and uses ">= 0" so a match at index 0 is not rejected.
        if (hostPath != null
            && hostPath.IndexOf("iexplore", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            listBox1.Items.Add(ie.LocationURL);
        }
    }
}
listbox_selectedindexchanged事件
/// <summary>
/// Loads the URL selected in <c>listBox1</c> into a temporary
/// <see cref="WebBrowser"/>, waits for <c>wb_DocumentCompleted</c> to capture
/// the page content, and shows the result in <c>textBox1</c>. Forum pages are
/// passed through <c>logic.Utility.ParseHtml</c> first.
/// </summary>
private void listBox1_SelectedIndexChanged(object sender, EventArgs e)
{
    // SelectedIndex is -1 when nothing is selected; the original "< -1" test could never be true.
    if (listBox1.SelectedIndex < 0 || listBox1.SelectedItem == null)
    {
        MessageBox.Show("請點選網址");
        return;
    }

    // "using" keeps the browser in scope for the whole operation and disposes it
    // even if navigation or parsing throws.
    using (WebBrowser wb = new WebBrowser())
    {
        // Reset the same fields that the wait loop and wb_DocumentCompleted use.
        // NOTE(review): the original reset g_done/g_html but read done/html;
        // confirm the declared field names.
        done = false;
        html = "";

        // Attach the handler before navigating so the completion event cannot be missed.
        wb.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(wb_DocumentCompleted);
        wb.Navigate(listBox1.SelectedItem.ToString());

        // Pump messages until the handler reports completion; keeps the UI responsive.
        while (!done)
        {
            Application.DoEvents();
            System.Threading.Thread.Sleep(10);
        }

        bool isForumPage = wb.Url != null
            && wb.Url.ToString().Contains("xxx.com/forums/");

        if (isForumPage)
        {
            // Forum pages were captured as raw HTML and need the clean-up pass.
            textBox1.Text = logic.Utility.ParseHtml(html);
        }
        else
        {
            textBox1.Text = html;
        }
    }
}
wb_DocumentCompleted事件
/// <summary>
/// Stores the loaded page content in <c>html</c> and sets <c>done</c> so the
/// wait loop in <c>listBox1_SelectedIndexChanged</c> can continue. Forum pages
/// are stored as raw HTML; every other page is stored as plain text.
/// </summary>
/// <param name="sender">The <see cref="WebBrowser"/> that raised the event.</param>
/// <param name="e">Event data for the completed document.</param>
private void wb_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    // The browser is a local of the calling handler, so it must be taken from sender.
    WebBrowser browser = (WebBrowser)sender;
    try
    {
        HtmlElement body = browser.Document != null ? browser.Document.Body : null;
        if (body == null)
        {
            html = string.Empty;
            return;
        }

        bool isForumPage = browser.Url != null
            && browser.Url.ToString().Contains("xxx.com/forums/");

        // Forum pages keep their markup so it can be parsed later;
        // ordinary pages need no HTML parsing, so plain text is enough.
        string content = isForumPage ? body.InnerHtml : body.InnerText;
        html = content ?? string.Empty;
    }
    finally
    {
        // Always release the waiting loop, even when the document is empty or an error occurs.
        done = true;
    }
}
logic.Utility 類別的 ParseHtml 方法
/// <summary>
/// Converts forum HTML to plain text: removes <c>&amp;nbsp;</c> entities,
/// script/style/head elements, the hidden SPAN/FONT filler inserted into
/// posts, and all remaining tags. <c>&lt;br&gt;</c> becomes a CRLF line break.
/// </summary>
/// <param name="html">HTML fragment to clean; may be null or empty.</param>
/// <returns>The cleaned text, or an empty string when <paramref name="html"/> is null or empty.</returns>
public static string ParseHtml(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // Remove only the non-breaking-space entity. Removing every literal space
    // would stop the SPAN/FONT patterns below (which contain spaces) from matching.
    string result = Regex.Replace(html, "&nbsp;", string.Empty, RegexOptions.IgnoreCase);

    result = StripElement(result, "script");
    result = StripElement(result, "style");
    result = StripElement(result, "head");

    // Hidden filler text: 15-70 characters inside an invisible SPAN or FONT.
    // The lazy quantifier stops at the first closing tag that satisfies the length.
    result = Regex.Replace(result, "<SPAN style=\"DISPLAY: none\">.{15,70}?</SPAN>", string.Empty, RegexOptions.IgnoreCase);
    result = Regex.Replace(result, "<FONT style=\"COLOR: #fff; FONT-SIZE: 0px\">.{15,70}?</FONT>", string.Empty, RegexOptions.IgnoreCase);

    // Line breaks, including the self-closing forms <br/> and <br />.
    result = Regex.Replace(result, @"<br\s*/?>", "\r\n", RegexOptions.IgnoreCase);

    // Drop every remaining tag, then any semicolons left in the text.
    result = Regex.Replace(result, "<[^>]*>", string.Empty);
    result = result.Replace(";", string.Empty);
    return result;
}

// Removes every <tagName ...>...</tagName> element together with its content.
// Singleline lets the content span several lines; the lazy quantifier keeps
// text that sits between two separate elements.
private static string StripElement(string input, string tagName)
{
    string pattern = @"<\s*" + tagName + @"\b[^>]*>.*?<\s*/\s*" + tagName + @"\s*>";
    return Regex.Replace(input, pattern, string.Empty, RegexOptions.IgnoreCase | RegexOptions.Singleline);
}
處理前
處理後
打完收工~~又要windows update了