和JS版功能相同,只是为了提高性能才用C#重写一次.
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Threading;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using MySql.Data;
using MySql.Data.MySqlClient;
namespace 网页关键词处理
{
public partial class keywords : Form
{
/// <summary>
/// Creates the form; all setup is delegated to InitializeComponent().
/// </summary>
public keywords()
{
InitializeComponent();
}
// Worker thread created and aborted by start_Click; its entry point is threadsafe().
private Thread t;
// Delegate type used by show() to re-invoke itself through Invoke() when InvokeRequired is true.
delegate void SetTextCallback(string text);
/// <summary>
/// Toggles the crawler: starts the worker thread when the button reads "start",
/// aborts it when the button reads "stop".
/// </summary>
/// <param name="sender">The control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void start_Click(object sender, EventArgs e)
{
    // The button caption doubles as the running/stopped flag.
    if (start.Text.Equals("stop"))
    {
        // Guard against a missing or already finished worker before aborting it.
        if (this.t != null && this.t.IsAlive)
        {
            this.t.Abort();
        }
        start.Text = "start";
    }
    else
    {
        this.t = new Thread(new ThreadStart(this.threadsafe));
        // Background thread: the worker loops indefinitely, so as a foreground
        // thread it would keep the process alive after the form is closed.
        this.t.IsBackground = true;
        this.t.Start();
        start.Text = "stop";
    }
}
/// <summary>
/// Worker-thread body. Repeatedly picks a URL, downloads it, extracts the title and
/// keyword string, and writes the result (or an error count) back to the <c>url</c> table.
/// </summary>
/// <remarks>
/// The URL comes from the <c>url</c> text box when it is non-empty; otherwise the row
/// ordered first by <c>url_update, url_error</c> is read from the database.
/// The method never returns normally; it ends when the thread is aborted.
/// NOTE(review): control text (url, db_*) is read from the worker thread, not the UI
/// thread — confirm this is acceptable or capture the values before starting the thread.
/// </remarks>
private void threadsafe()
{
    // Iterate instead of calling threadsafe() again at the end: the self-call grew the
    // stack by one frame per processed page until the thread overflowed.
    while (true)
    {
        string _sha1 = "";
        string _url = url.Text.Trim();

        if (_url.Equals(""))
        {
            // No manual URL: fetch the next candidate row from the database.
            try
            {
                using (MySqlConnection conn = new MySqlConnection(getConnectionString()))
                {
                    conn.Open();
                    using (MySqlCommand cmd = new MySqlCommand("SELECT url_sha1,url_link FROM url ORDER BY url_update,url_error ASC LIMIT 1", conn))
                    using (MySqlDataReader rdr = cmd.ExecuteReader())
                    {
                        while (rdr.Read())
                        {
                            _sha1 = rdr.GetString(0).Trim();
                            _url = rdr.GetString(1).Trim();
                        }
                    }
                }
            }
            catch (ThreadAbortException)
            {
                // Stop was requested; do not report it as a database error.
                throw;
            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.Message);
            }
        }

        show("访问:" + _url);
        try
        {
            html h = new html(gethttp(_url));
            string title = h.Title;
            string body = h.Body;
            using (MySqlConnection conn = new MySqlConnection(getConnectionString()))
            {
                conn.Open();
                // Parameters instead of string concatenation: page text routinely contains
                // quotes, which previously broke the statement and allowed SQL injection.
                using (MySqlCommand cmd = new MySqlCommand("UPDATE url SET url_title=@title,url_keywords=@keywords,url_update = now() WHERE url_sha1 = @sha1", conn))
                {
                    cmd.Parameters.AddWithValue("@title", title);
                    cmd.Parameters.AddWithValue("@keywords", body);
                    cmd.Parameters.AddWithValue("@sha1", _sha1);
                    cmd.ExecuteNonQuery();
                }
            }
            show(title);
            show(body);
        }
        catch (ThreadAbortException)
        {
            // Stop was requested; this is not a page failure, so do not bump url_error.
            throw;
        }
        catch (Exception e)
        {
            // Record the failure so the row moves back in the processing order.
            show("访问出错!" + e.Message);
            try
            {
                using (MySqlConnection conn = new MySqlConnection(getConnectionString()))
                {
                    conn.Open();
                    using (MySqlCommand cmd = new MySqlCommand("UPDATE url SET url_update = now(),url_error= url_error + 1 WHERE url_sha1 = @sha1", conn))
                    {
                        cmd.Parameters.AddWithValue("@sha1", _sha1);
                        cmd.ExecuteNonQuery();
                    }
                }
            }
            catch (ThreadAbortException)
            {
                throw;
            }
            catch (Exception dbError)
            {
                // A database failure while recording the error must not kill the worker.
                show(dbError.Message);
            }
        }
    }
}

// Builds the MySQL connection string from the db_* text boxes (port 3306, utf8).
private string getConnectionString()
{
    return "server=" + db_servers.Text.Trim() + ";user=" + db_uid.Text.Trim() + ";database=" + db_db.Text.Trim() + ";port=3306;password=" + db_pwd.Text.Trim() + ";charset=utf8";
}
/// <summary>
/// Downloads <paramref name="url"/> and decodes the response body to a string.
/// </summary>
/// <remarks>
/// The charset is taken from the Content-Type response header when present and known.
/// Otherwise the bytes are decoded with <see cref="Encoding.Default"/> and re-decoded
/// if the page text itself declares a different charset.
/// </remarks>
/// <param name="url">Address of the page to download.</param>
/// <returns>The decoded page text.</returns>
/// <exception cref="WebException">The download failed.</exception>
private string gethttp(string url)
{
    byte[] pageData;
    string contentType = null;
    using (WebClient wc = new WebClient())
    {
        wc.Credentials = CredentialCache.DefaultCredentials;
        wc.Headers.Add("User-Agent", "Mozilla/5.0+(compatible;+cpcunionbot/1.0;++http://www.cpcunion.com/bot.html)");
        pageData = wc.DownloadData(url);
        // The header may be absent entirely; the indexer returns null in that case.
        if (wc.ResponseHeaders != null)
        {
            contentType = wc.ResponseHeaders["Content-Type"];
        }
    }

    // 1) Charset announced in the HTTP header, e.g. "text/html; charset=utf-8".
    if (contentType != null)
    {
        int at = contentType.IndexOf("charset=", StringComparison.OrdinalIgnoreCase);
        if (at >= 0)
        {
            string headerCharset = contentType.Substring(at + 8);
            int end = headerCharset.IndexOf(';');
            if (end >= 0)
            {
                headerCharset = headerCharset.Substring(0, end);
            }
            headerCharset = headerCharset.Trim().Trim('"', '\'');
            if (headerCharset.Length > 0)
            {
                try
                {
                    return Encoding.GetEncoding(headerCharset).GetString(pageData);
                }
                catch (ArgumentException)
                {
                    // Unknown charset name: fall through to sniffing the page text.
                }
                catch (NotSupportedException)
                {
                    // Encoding not available on this platform: fall through as well.
                }
            }
        }
    }

    // 2) Decode with the system default, then look for a charset declared in the markup.
    string text = Encoding.Default.GetString(pageData);
    try
    {
        // The optional opening quote lets this match both content="...; charset=x" and charset="x".
        Match declared = Regex.Match(text, @"charset\b\s*=\s*[""']?(?<charset>[^""'\s;/>]+)", RegexOptions.IgnoreCase);
        if (declared.Success)
        {
            Encoding pageEncoding = Encoding.GetEncoding(declared.Groups["charset"].Value);
            if (!pageEncoding.Equals(Encoding.Default))
            {
                text = pageEncoding.GetString(pageData);
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: keep the default-decoded text and report why sniffing failed.
        show(ex.Message);
    }
    return text;
}
/// <summary>
/// Appends <paramref name="msg"/> to the info list. When called from a thread for which
/// info.InvokeRequired is true, the call is re-dispatched through Invoke().
/// </summary>
/// <param name="msg">Text to append.</param>
private void show(string msg)
{
    if (info.InvokeRequired)
    {
        // Re-enter this method via the form's Invoke.
        SetTextCallback reenter = new SetTextCallback(show);
        object[] args = new object[] { msg };
        this.Invoke(reenter, args);
        return;
    }

    info.Items.Add(msg);

    // Once more than 30 entries are present (counting the one just added), wipe the list.
    bool tooMany = info.Items.Count > 30;
    if (tooMany)
    {
        info.Items.Clear();
    }
}
// Event handler with an empty body; presumably wired to the form's Load event in the designer file — verify.
private void keywords_Load(object sender, EventArgs e) { }
/// <summary>
/// Downloads the address in the url text box once and shows the extracted title and
/// keyword string in message boxes.
/// </summary>
/// <param name="sender">The control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    html h;
    try
    {
        h = new html(gethttp(url.Text.Trim()));
    }
    catch (Exception ex)
    {
        // An empty or unreachable address used to surface as an unhandled exception
        // on the UI thread; report it the same way the worker does instead.
        MessageBox.Show("访问出错!" + ex.Message);
        return;
    }

    MessageBox.Show("网页标题:" + h.Title);
    MessageBox.Show("参考关键词:" + h.Body);
}
}
}
using System;
using System.Collections.Generic;
using System.Collections;
using System.Text;
using System.Web;
using System.Text.RegularExpressions;
namespace 网页关键词处理
{
class html
{
// Upper limit compared against getBody's chscount counter; the inner scan stops once the counter exceeds it.
private const int cn_max = 8;
// Not referenced in the code visible here.
private const int cn_min = 2;
// Exclusive upper bound on the candidate substring length tried by getBody's inner loop.
private const int en_max = 84;
// Not referenced in the code visible here.
private const int en_min = 2;
// Characters that end a candidate substring in getBody (checked against the candidate's last character).
private const string endchars = " ;:,:,..。'“”??_{}/\\·[]()()、…ˉ-!!<>《》〉〈〕〔【】〗〖[]{}『』」「¥—";
// Page title; set by the constructor from getTitle().
private string title;
/// <summary>Gets the page title computed by the constructor.</summary>
public string Title
{
get { return title; }
}
// Keyword string; set by the constructor from getBody().
private string body;
/// <summary>Gets the keyword string computed by the constructor.</summary>
public string Body
{
get { return body; }
}
// Never assigned in the code visible here, so Keywords returns null.
private string keywords;
/// <summary>Gets the keywords field (not populated by the visible constructor).</summary>
public string Keywords
{
get { return keywords; }
}
// Never assigned in the code visible here, so Description returns null.
private string description;
/// <summary>Gets the description field (not populated by the visible constructor).</summary>
public string Description
{
get { return description; }
}
/// <summary>
/// Parses a page: stores its trimmed title and its trimmed keyword string.
/// </summary>
/// <param name="code">Raw HTML of the page.</param>
public html(string code)
{
    // The title must be stored first: getBody reads this.title when weighting candidates.
    string rawTitle = this.getTitle(code);
    this.title = rawTitle.Trim();

    string rawBody = this.getBody(code);
    this.body = rawBody.Trim();
}
/// <summary>
/// Builds the keyword string for a page: strips the markup with NoHTML, counts how often
/// candidate substrings occur in the resulting text, and returns a string of at most
/// 255 characters.
/// </summary>
/// <param name="code">Raw HTML of the page.</param>
/// <returns>The keyword string (truncated to 255 characters).</returns>
// NOTE(review): parts of this method appear to have been lost when the source was
// extracted (text between '<' and '>' is missing); see the notes at the affected lines.
private string getBody(string code) {
string strfrom = NoHTML(code);
//return strfrom;
int strlength = strfrom.Length;
int chscount, endasc, cx, ca, oldcount=0;
string tempstr, oldchar="";
char endchar;
// NOTE(review): Dictionary has no type arguments here; the usage below (string keys,
// int values) suggests Dictionary<string,int> — confirm against the original source.
Dictionary strdic = new Dictionary();
// Chinese/English word segmentation: for every start index i, try substrings of growing length j.
for (var i = 0; i < strlength; i++) {
chscount = 0;
for (var j = 1; j < en_max; j++) {
if ((i + j) <= strlength){
tempstr = strfrom.Substring(i, j);
// Byte at index j-1 of the candidate encoded with Encoding.Default, and the candidate's last character.
endasc = (int)Encoding.Default.GetBytes(tempstr)[j-1];
endchar = tempstr[j-1];
// Stop extending the candidate when its last character is in endchars or the byte value
// falls in 0-47, 58-64 or 91-96.
if (endchars.IndexOf(endchar) != -1 || (endasc >= 0 && endasc <= 47) || (endasc >= 58 && endasc <= 64) || (endasc >= 91 && endasc <= 96))
{
break;
}else{
// Presumably counts characters treated as Chinese; the scan stops after cn_max of them — verify.
if(endasc <= 0 || endasc >= 254)chscount++;
if (chscount > cn_max) break;
if (tempstr.Length > 1){
// A match in the title doubles the weight.
// ca = number of occurrences of the candidate in the text, derived from the length removed by Replace.
ca = (strlength - strfrom.Replace(tempstr, String.Empty).Length) / tempstr.Length;
cx = this.title.IndexOf(tempstr) != -1? ca*2 : ca;
// Original note: discard words that occur only once (the test below is cx>0).
if(cx>0){
if (!strdic.ContainsKey(tempstr)){
strdic.Add(tempstr, cx);
}
// When the current candidate starts with the previous one, reduce the previous
// candidate's stored count by the current weight.
if (oldcount > 0 && tempstr.IndexOf(oldchar) == 0){
strdic[oldchar] = oldcount - cx;
// Discard words whose remaining count is below 2.
if(strdic[oldchar]<2)strdic.Remove(oldchar);
}
oldchar = tempstr;
oldcount= cx;
};
}
}
}
}
}
// Copy the dictionary into parallel arrays (a = keys, b = counts) for the pass below.
int c = strdic.Count;
string[] a = new string[c];
int[] b = new int[c];
int ti = 0;
string upa="",doa="";
int upb=0,dob=0;
keys[] keys = new keys[c];
strdic.Keys.CopyTo(a,0);
strdic.Values.CopyTo(b,0);
// Walk the entries from last to first, comparing each entry with the one before it.
for(int i=c-1;i>=0;i--){
if(i>0){
upa = a[i];
upb = b[i];
doa = a[i-1];
dob = b[i-1];
// Apparently tests whether the previous key ends with the current key, and if so
// subtracts the previous count from the current one — verify.
if(doa.IndexOf(upa)==doa.Length-upa.Length){
upb = upb - dob;
}
if(upb>1){// Discard words that occur only once.
keys[ti] = new keys();
keys[ti].Str = upa;
keys[ti].Count=upb;
ti++;
}
}else{
// i == 0: stores doa/dob as left by the previous iteration; ti is not incremented here.
keys[ti] = new keys();
keys[ti].Str = doa;
keys[ti].Count=dob;
}
}
// Sort by match count.
bool py = true;
int pyd, pye, pyf;
string pys;
c = ti;
while(py){
py = false;
// NOTE(review): the following lines are incomplete — the loop header, the comparison and
// swap, and the declaration of keystring are not visible. Restore them from the
// original source before compiling.
for(int i=0;i= c)break;
pyd = keys[i].Count;
pye = keys[i+1].Count;
if(pyd= 255)
break;
else
keystring += " ";
}
// Cap the result at 255 characters.
if (keystring.Length > 255) keystring = keystring.Substring(0, 255);
return keystring;
}
/// <summary>
/// Reduces an HTML document to plain text: removes script and style elements, HTML
/// comments and all remaining tags, decodes HTML entities, and collapses whitespace.
/// </summary>
/// <param name="Htmlstring">Raw HTML; null or empty yields an empty string.</param>
/// <returns>The text content with every run of whitespace replaced by a single space.</returns>
public string NoHTML(string Htmlstring)
{
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return string.Empty;
    }

    // Remove whole script/style elements and comments first so their contents do not
    // survive as text once the tags are stripped. (The previous patterns were empty,
    // which matched at every position and inserted a space between all characters.)
    Htmlstring = Regex.Replace(Htmlstring, @"(?is)<script\b.*?</script>", " ");
    Htmlstring = Regex.Replace(Htmlstring, @"(?is)<style\b.*?</style>", " ");
    Htmlstring = Regex.Replace(Htmlstring, @"(?is)<!--.*?-->", " ");

    // Strip every remaining tag, then turn entities such as &amp; into characters.
    Htmlstring = Regex.Replace(Htmlstring, @"(?is)<.*?>", " ");
    Htmlstring = HttpUtility.HtmlDecode(Htmlstring);

    // Collapse whitespace runs (including line breaks) into single spaces.
    Htmlstring = Regex.Replace(Htmlstring, @"(?is)(\s+|\n+)", " ");
    return Htmlstring;
}
/// <summary>Gets the content of the page's <c>&lt;meta name="description"&gt;</c> tag.</summary>
/// <param name="code">Raw HTML of the page.</param>
/// <returns>The content attribute value, or an empty string when the tag is absent.</returns>
private string getDescription(string code)
{
    return getMetaContent(code, "description");
}

/// <summary>Gets the content of the page's <c>&lt;meta name="keywords"&gt;</c> tag.</summary>
/// <param name="code">Raw HTML of the page.</param>
/// <returns>The content attribute value, or an empty string when the tag is absent.</returns>
private string getKeywords(string code)
{
    return getMetaContent(code, "keywords");
}

// Returns the quoted content attribute of the first <meta name="..."> tag with the given
// name. Only the name-before-content attribute order is recognised.
private static string getMetaContent(string code, string name)
{
    if (string.IsNullOrEmpty(code))
    {
        return "";
    }

    // \k<q> requires the closing quote to be the same character as the opening one, so an
    // apostrophe inside a double-quoted value does not end the match early.
    string pattern = @"(?is)<meta\s+name\s*=\s*[""']?" + Regex.Escape(name)
        + @"[""']?\s+content\s*=\s*(?<q>[""'])(?<content>.*?)\k<q>";
    Match m = Regex.Match(code, pattern);
    return m.Success ? m.Groups["content"].Value : "";
}
/// <summary>Gets the text of the page's <c>&lt;title&gt;</c> element.</summary>
/// <param name="code">Raw HTML of the page.</param>
/// <returns>
/// The title text cut to at most 50 characters, or an empty string when the page has
/// no title element.
/// </returns>
private string getTitle(string code)
{
    if (string.IsNullOrEmpty(code))
    {
        return "";
    }

    // Capture what lies between the opening and closing title tags; the previous pattern
    // had lost both tags and returned the text before the first space instead.
    Match m = Regex.Match(code, @"(?is)<title[^>]*>(?<title>.*?)</title>");
    string tit = m.Success ? m.Groups["title"].Value : "";
    if (tit.Length > 50)
    {
        tit = tit.Substring(0, 50);
    }
    return tit;
}
}
}
using System;
using System.Collections.Generic;
using System.Text;
namespace 网页关键词处理
{
/// <summary>
/// One keyword candidate: a piece of text paired with an integer count.
/// </summary>
class keys
{
    /// <summary>Gets or sets the keyword text.</summary>
    public string Str { get; set; }

    /// <summary>Gets or sets the count stored for the keyword.</summary>
    public int Count { get; set; }
}
}
Leave a Reply