Home »» C# »» 词频分析

词频分析

by zhuolin | posted: 2014年3月1日 0 Comment

和 JS 版功能相同，只是为了提高性能才用 C# 重写了一次。

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Threading;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using MySql.Data;
using MySql.Data.MySqlClient;

namespace 网页关键词处理
{
    public partial class keywords : Form
    {
        /// <summary>
        /// Creates the form. All setup is delegated to InitializeComponent, which is
        /// not part of this listing (presumably the designer-generated half of this
        /// partial class).
        /// </summary>
        public keywords()
        {
            InitializeComponent();
        }
        private Thread t;
        delegate void SetTextCallback(string text);
        /// <summary>
        /// Toggles the crawl worker. When the button reads "stop" the running worker
        /// thread is aborted; otherwise a new worker running <c>threadsafe</c> is started.
        /// The button caption is flipped to reflect the new state.
        /// </summary>
        /// <param name="sender">The control that raised the click (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void start_Click(object sender, EventArgs e){
            if (start.Text.Equals("stop")) {
                // Guard against the caption saying "stop" while no worker exists
                // (e.g. caption preset in the designer).
                if (this.t != null) {
                    // NOTE(review): Thread.Abort is kept because the worker loop has no
                    // cooperative stop flag; it is unsupported on .NET Core / .NET 5+.
                    this.t.Abort();
                    this.t = null;
                }
                start.Text = "start";
            }else{
                this.t = new Thread(new ThreadStart(this.threadsafe));
                // The worker loops forever; as a foreground thread it would keep the
                // process alive after the form is closed.
                this.t.IsBackground = true;
                this.t.Start();
                start.Text = "stop";
            }
        }
        /// <summary>
        /// Worker-thread entry point: repeats a single crawl step forever (until the
        /// thread is aborted by <c>start_Click</c>).
        /// </summary>
        /// <remarks>
        /// The original re-invoked itself at the end of every step; each pass added a
        /// stack frame, so a long run would end in a StackOverflowException. A loop
        /// performs the same sequence of steps with constant stack depth.
        /// NOTE(review): the helpers below still read control text from the worker
        /// thread, as the original did — confirm this is acceptable for the target
        /// WinForms configuration.
        /// </remarks>
        private void threadsafe() {
            while (true) {
                crawlOnce();
            }
        }

        /// <summary>
        /// Builds the MySQL connection string from the server/user/database/password
        /// text boxes (port 3306, utf8 charset).
        /// </summary>
        private string buildConnectionString() {
            return "server=" + db_servers.Text.Trim() + ";user=" + db_uid.Text.Trim() + ";database=" + db_db.Text.Trim() + ";port=3306;password=" + db_pwd.Text.Trim() + ";charset=utf8";
        }

        /// <summary>
        /// Performs one crawl step: choose a URL (the URL text box, or the first row
        /// returned by the url-table query when the box is empty), download and
        /// analyse it, then store title/keywords. On failure the row's error counter
        /// is incremented instead.
        /// </summary>
        private void crawlOnce() {
            string connectionString = buildConnectionString();
            string _sha1 = "";
            string _url = url.Text.Trim();

            if (_url.Equals(""))
            {
                // No manual URL: take the next one from the database.
                try
                {
                    using (MySqlConnection conn = new MySqlConnection(connectionString))
                    {
                        conn.Open();
                        using (MySqlCommand cmd = new MySqlCommand("SELECT url_sha1,url_link FROM url ORDER BY url_update,url_error ASC LIMIT 1", conn))
                        using (MySqlDataReader rdr = cmd.ExecuteReader())
                        {
                            while (rdr.Read()) {
                                _sha1 = rdr.GetString(0).Trim();
                                _url = rdr.GetString(1).Trim();
                            }
                        }
                    }
                }
                catch (ThreadAbortException)
                {
                    // The user pressed stop: do not pop up an error dialog for it.
                    throw;
                }
                catch (Exception ex)
                {
                    MessageBox.Show(ex.Message);
                }
            }

            show("访问:"+_url);
            try
            {
                html h = new html(gethttp(_url));
                string title = h.Title;
                string body = h.Body;
                // Title and body come from an arbitrary web page, so they are bound as
                // parameters instead of being concatenated into the SQL text.
                using (MySqlConnection conn = new MySqlConnection(connectionString))
                using (MySqlCommand cmd = new MySqlCommand("UPDATE url SET url_title=@title,url_keywords=@keywords,url_update = now() WHERE url_sha1 = @sha1", conn))
                {
                    cmd.Parameters.AddWithValue("@title", title);
                    cmd.Parameters.AddWithValue("@keywords", body);
                    cmd.Parameters.AddWithValue("@sha1", _sha1);
                    conn.Open();
                    cmd.ExecuteNonQuery();
                }
                show(title);
                show(body);
            }
            catch (ThreadAbortException)
            {
                // A user-requested stop must not be recorded as a failure of this URL.
                throw;
            }
            catch (Exception e)
            {
                // Record the error against the URL.
                show("访问出错!" + e.Message);
                recordError(connectionString, _sha1);
            }
        }

        /// <summary>
        /// Marks the row identified by <paramref name="sha1"/> as updated now and
        /// increments its error counter. Database failures are reported through
        /// <c>show</c> rather than thrown, so a broken connection cannot take down the
        /// worker thread from inside an exception handler.
        /// </summary>
        private void recordError(string connectionString, string sha1) {
            try
            {
                using (MySqlConnection conn = new MySqlConnection(connectionString))
                using (MySqlCommand cmd = new MySqlCommand("UPDATE url SET url_update = now(),url_error= url_error + 1 WHERE url_sha1 = @sha1", conn))
                {
                    cmd.Parameters.AddWithValue("@sha1", sha1);
                    conn.Open();
                    cmd.ExecuteNonQuery();
                }
            }
            catch (ThreadAbortException)
            {
                throw;
            }
            catch (Exception ex)
            {
                show(ex.Message);
            }
        }
        /// <summary>
        /// Downloads <paramref name="url"/> and decodes the response body to text.
        /// </summary>
        /// <remarks>
        /// Decoding order: (1) the charset named in the Content-Type response header;
        /// (2) a <c>charset=</c> declaration found in the markup after a provisional
        /// decode with <c>Encoding.Default</c>; (3) the provisional decode itself.
        /// </remarks>
        /// <param name="url">Address to download.</param>
        /// <returns>The decoded page text.</returns>
        /// <exception cref="WebException">The download failed.</exception>
        private string gethttp(string url) {
            Byte[] pageData;
            string contentType = null;
            // WebClient is disposable; the original never released it.
            using (WebClient wc = new WebClient())
            {
                wc.Credentials = CredentialCache.DefaultCredentials;
                wc.Headers.Add("User-Agent", "Mozilla/5.0+(compatible;+cpcunionbot/1.0;++http://www.cpcunion.com/bot.html)");
                pageData = wc.DownloadData(url);
                // The indexer yields null when the header is absent, whereas
                // GetValues(...).GetValue(0) threw NullReferenceException.
                if (wc.ResponseHeaders != null)
                    contentType = wc.ResponseHeaders["Content-Type"];
            }

            if (contentType != null)
            {
                int charsetPos = contentType.IndexOf("charset=", StringComparison.OrdinalIgnoreCase);
                // Only trust the header when it really carries a charset; the original
                // took Substring(7) of the header when "charset=" was missing.
                if (charsetPos >= 0)
                {
                    string headerCharset = contentType.Substring(charsetPos + 8);
                    int paramEnd = headerCharset.IndexOf(';');
                    if (paramEnd >= 0)
                        headerCharset = headerCharset.Substring(0, paramEnd);
                    headerCharset = headerCharset.Trim(' ', '"', '\'');
                    try
                    {
                        return Encoding.GetEncoding(headerCharset).GetString(pageData);
                    }
                    catch (Exception)
                    {
                        // Unknown or unsupported charset name: fall back to sniffing
                        // the markup below (same best-effort behaviour as before).
                    }
                }
            }

            string text = Encoding.Default.GetString(pageData);
            try
            {
                // The group name was lost from the published pattern ("(?[^\"]*)" is not a
                // valid regex); it is restored here because the match is read through
                // Groups["charset"]. An optional opening quote also covers <meta charset="...">.
                Regex reg_charset = new Regex(@"charset\b\s*=\s*[""']?(?<charset>[^""'\s;/>]+)", RegexOptions.IgnoreCase);
                Match declared = reg_charset.Match(text);
                if (declared.Success)
                    text = Encoding.GetEncoding(declared.Groups["charset"].Value).GetString(pageData);
            }
            catch (Exception ex)
            {
                // Keep the provisional decode and report why the declared charset was unusable.
                show(ex.Message);
            }
            return text;
        }
        /// <summary>
        /// Appends <paramref name="msg"/> to the <c>info</c> list. When called from a
        /// thread for which <c>info.InvokeRequired</c> is true, the call is re-dispatched
        /// through <c>Invoke</c>.
        /// </summary>
        /// <param name="msg">Text to append.</param>
        private void show(string msg) {
            if (!info.InvokeRequired)
            {
                info.Items.Add(msg);
                // Once more than 30 entries have accumulated the whole list is wiped,
                // including the entry that was just added.
                if (info.Items.Count > 30)
                {
                    info.Items.Clear();
                }
                return;
            }

            SetTextCallback redispatch = new SetTextCallback(show);
            this.Invoke(redispatch, new object[] { msg });
        }
        // Load handler for the form; intentionally empty (no work is done here).
        private void keywords_Load(object sender, EventArgs e) { }

        /// <summary>
        /// Downloads the address in the URL text box, analyses it and shows the page
        /// title and the suggested keywords in two message boxes.
        /// </summary>
        /// <param name="sender">The control that raised the click (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            try
            {
                html h = new html(gethttp(url.Text.Trim()));
                string title = h.Title;
                string body = h.Body;
                MessageBox.Show("网页标题:"+title);
                MessageBox.Show("参考关键词:"+body);
            }
            catch (Exception ex)
            {
                // An empty/invalid URL or a network failure used to escape from this UI
                // event handler unhandled; report it the same way the worker does.
                MessageBox.Show("访问出错!" + ex.Message);
            }
        }
    }
}

using System;
using System.Collections.Generic;
using System.Collections;
using System.Text;
using System.Web;
using System.Text.RegularExpressions;
namespace 网页关键词处理
{
    class html
    {
        // Upper bound compared against getBody's chscount counter for one candidate.
        private const int cn_max = 8; 
        // Not referenced by the code visible in this listing.
        private const int cn_min = 2;
        // Exclusive upper bound of getBody's inner loop, i.e. candidates are shorter than this.
        private const int en_max = 84;
        // Not referenced by the code visible in this listing.
        private const int en_min = 2;
        // Characters that end a candidate in getBody (ASCII and full-width punctuation).
        private const string endchars = " ;:,：，．.。'“”?？_{}/\\·[]()（）、…ˉ-!！<>《》〉〈〕〔【】〗〖［］｛｝『』」「￥—";
        // Backing field for Title.
        private string title;

        /// <summary>Read-only view of the <c>title</c> field.</summary>
        public string Title
        {
            get { return title; }
        }
        // Backing field for Body.
        private string body;

        /// <summary>Read-only view of the <c>body</c> field.</summary>
        public string Body
        {
            get { return body; }
        }
        // Backing field for Keywords.
        private string keywords;

        /// <summary>Read-only view of the <c>keywords</c> field.</summary>
        public string Keywords
        {
            get { return keywords; }
        }
        // Backing field for Description.
        private string description;

        /// <summary>Read-only view of the <c>description</c> field.</summary>
        public string Description
        {
            get { return description; }
        }
        /// <summary>
        /// Analyses a page: fills <see cref="Title"/>, <see cref="Body"/>,
        /// <see cref="Keywords"/> and <see cref="Description"/> from its markup.
        /// </summary>
        /// <param name="code">Raw markup of the page; null is treated as empty.</param>
        public html(string code) {
            if (code == null) code = "";
            // The title must be extracted first: getBody reads this.title to weight
            // candidates that also occur in the title.
            this.title = this.getTitle(code).Trim();
            this.body = this.getBody(code).Trim();
            // These two fields were never assigned, so the public Keywords and
            // Description properties always returned null.
            this.keywords = this.getKeywords(code).Trim();
            this.description = this.getDescription(code).Trim();
        }
        /// <summary>
        /// Builds the keyword string for a page: strips markup with NoHTML, enumerates
        /// candidate substrings of the remaining text, counts how often each occurs
        /// (doubling the count for candidates that also occur in the title), and returns
        /// a string of at most 255 characters.
        /// </summary>
        /// <remarks>
        /// NOTE(review): parts of this method appear to have been lost when the post was
        /// published (text between '&lt;' and '&gt;' seems to have been stripped); see the
        /// notes at the dictionary declaration and at the sorting section. The method
        /// does not compile as listed and must be restored from the original source.
        /// </remarks>
        /// <param name="code">Raw markup of the page.</param>
        /// <returns>The keyword string, truncated to 255 characters.</returns>
        private string getBody(string code) {
            string strfrom = NoHTML(code);
            //return strfrom;
            int strlength = strfrom.Length;
            int chscount, endasc, cx, ca, oldcount=0;
            string tempstr, oldchar="";
            char endchar;
            // NOTE(review): generic type arguments appear to be missing here; the code
            // below stores int counts under string keys — presumably Dictionary<string, int>.
            Dictionary strdic = new Dictionary();
            // Word segmentation for Chinese and English text: for every start index i,
            // grow the candidate one character at a time (lengths 1 .. en_max - 1).
            for (var i = 0; i < strlength; i++) {
                chscount = 0;
                for (var j = 1; j < en_max; j++) {
                    if ((i + j) <= strlength){
                        tempstr = strfrom.Substring(i, j);
                        // Byte at index j-1 of the Encoding.Default bytes of the candidate.
                        // NOTE(review): if earlier characters encode to several bytes this is
                        // not the last character's byte — confirm the intent.
                        endasc = (int)Encoding.Default.GetBytes(tempstr)[j-1];
                        endchar = tempstr[j-1];
                        // Stop growing when the newest character is listed in endchars or the
                        // byte value lies in 0-47, 58-64 or 91-96.
                        if (endchars.IndexOf(endchar) != -1 || (endasc >= 0 && endasc <= 47) || (endasc >= 58 && endasc <= 64) || (endasc >= 91 && endasc <= 96))
                        {
                            break;
                        }else{
                            // NOTE(review): presumably meant to count Chinese characters; endasc
                            // comes from a byte, so only the ">= 254" (or "== 0") part can be true.
                            if(endasc <= 0 || endasc >= 254)chscount++;
                            if (chscount > cn_max) break;
                            if (tempstr.Length > 1){
                                // ca = number of occurrences removed by Replace, i.e. how often the
                                // candidate occurs in the text; candidates that also occur in the
                                // title get double weight.
                                ca = (strlength - strfrom.Replace(tempstr, String.Empty).Length) / tempstr.Length;
                                cx = this.title.IndexOf(tempstr) != -1? ca*2 : ca;
                                // Original comment: "discard words that occur only once".
                                // The test itself only requires cx > 0.
                                if(cx>0){
                                    if (!strdic.ContainsKey(tempstr)){
                                        strdic.Add(tempstr, cx);
                                    }
                                    // When the previous candidate is a prefix of this one, its stored
                                    // count is replaced by (previous count - this count).
                                    if (oldcount > 0 && tempstr.IndexOf(oldchar) == 0){
                                        strdic[oldchar] = oldcount - cx;
                                        // Discard words that occur only once
                                        if(strdic[oldchar]<2)strdic.Remove(oldchar);
                                    }
                                    oldchar = tempstr;
                                    oldcount= cx;
                                };
                            }
                        }

                    }
                }            
            
            }
            // Copy the dictionary into parallel arrays and walk them from the last entry
            // to the first, building the keys[] result.
            int c = strdic.Count;
           string[] a = new string[c];
           int[] b = new int[c];
           int ti = 0;
            string upa="",doa="";
            int upb=0,dob=0;
            keys[] keys = new keys[c];
            strdic.Keys.CopyTo(a,0);
            strdic.Values.CopyTo(b,0);
            for(int i=c-1;i>=0;i--){
                if(i>0){
                    upa = a[i];
                    upb = b[i];
                    doa = a[i-1];
                    dob = b[i-1];
                    // When the first occurrence of upa inside the preceding entry sits at
                    // that entry's tail, subtract the preceding entry's count.
                    if(doa.IndexOf(upa)==doa.Length-upa.Length){
                        upb = upb - dob;
                    }
                    if(upb>1){// Discard words that occur only once
                        keys[ti] = new keys();
                        keys[ti].Str = upa;
                        keys[ti].Count=upb;
                        ti++;
                    }
                }else{
                    // i == 0: stores doa/dob as left by the previous iteration (or ""/0
                    // when there is a single entry); ti is not advanced here.
                    keys[ti] = new keys();
                    keys[ti].Str = doa;
                    keys[ti].Count=dob;
                }
            }
            // Sort by number of matches
            bool py = true;
            int pyd, pye, pyf;
            string pys;
            c = ti;
            // NOTE(review): the loop below is truncated in this listing — the for-header,
            // the comparison/swap of neighbouring entries and the declaration of keystring
            // are missing. It looks like a swap-based sort followed by joining the entries
            // with spaces up to 255 characters — confirm against the original source.
            while(py){
                py = false;
                for(int i=0;i= c)break;
                    pyd = keys[i].Count;
                    pye = keys[i+1].Count;
            if(pyd= 255)
                    break;
                else
                    keystring += " ";
            
            }
            // Hard cap on the returned keyword string.
            if (keystring.Length > 255) keystring = keystring.Substring(0, 255);
                return keystring;
        }
        /// <summary>
        /// Reduces markup to plain text: script blocks, style blocks, comments and all
        /// remaining tags are replaced by a space, HTML entities are decoded, and every
        /// run of whitespace is collapsed to a single space.
        /// </summary>
        /// <param name="Htmlstring">Markup to clean; null or empty yields an empty string.</param>
        /// <returns>The text content with single-space separators.</returns>
        public string NoHTML(string Htmlstring)
        {
            if (string.IsNullOrEmpty(Htmlstring))
            {
                return "";
            }

            // The first three patterns were published as a bare "(?is)", which matches the
            // empty string at every position and therefore inserted a space between all
            // characters. They are restored to remove whole script/style/comment blocks
            // (content included) before the generic tag pattern runs.
            Htmlstring = Regex.Replace(Htmlstring, @"(?is)<script\b[^>]*>.*?</script\s*>", " ");
            Htmlstring = Regex.Replace(Htmlstring, @"(?is)<style\b[^>]*>.*?</style\s*>", " ");
            Htmlstring = Regex.Replace(Htmlstring, @"(?is)<!--.*?-->", " ");
            Htmlstring = Regex.Replace(Htmlstring, @"(?is)<.*?>", " ");
            Htmlstring = HttpUtility.HtmlDecode(Htmlstring);

            // \s already includes \n, so this equals the original (\s+|\n+).
            Htmlstring = Regex.Replace(Htmlstring, @"\s+", " ");
            return Htmlstring;
        }
        /// <summary>
        /// Returns the content of the page's <c>&lt;meta name="description"&gt;</c> tag,
        /// or an empty string when there is none.
        /// </summary>
        /// <param name="code">Raw markup of the page.</param>
        private string getDescription(string code) {
            return getMetaContent(code, "description");
        }

        /// <summary>
        /// Returns the content of the page's <c>&lt;meta name="keywords"&gt;</c> tag,
        /// or an empty string when there is none.
        /// </summary>
        /// <param name="code">Raw markup of the page.</param>
        private string getKeywords(string code) {
            return getMetaContent(code, "keywords");
        }

        /// <summary>
        /// Extracts the quoted <c>content</c> attribute of the first meta tag whose
        /// <c>name</c> attribute equals <paramref name="metaName"/> (case-insensitive),
        /// accepting either attribute order.
        /// </summary>
        /// <remarks>
        /// The published patterns were "(?is)(?&lt;=).*?(?=)": with both look-arounds empty
        /// they always match the empty string at position 0, so both callers always
        /// returned "". The tag text was evidently stripped from the listing.
        /// </remarks>
        private static string getMetaContent(string code, string metaName) {
            if (string.IsNullOrEmpty(code))
            {
                return "";
            }

            string name = Regex.Escape(metaName);
            // name="..." before content="..."; \k<q> requires the same quote that opened the value.
            Match m = Regex.Match(code, @"(?is)<meta\b[^>]*?\bname\s*=\s*[""']?" + name + @"\b[""']?[^>]*?\bcontent\s*=\s*(?<q>[""'])(?<v>.*?)\k<q>");
            if (!m.Success)
            {
                // content="..." before name="..."
                m = Regex.Match(code, @"(?is)<meta\b[^>]*?\bcontent\s*=\s*(?<q>[""'])(?<v>.*?)\k<q>[^>]*?\bname\s*=\s*[""']?" + name + @"\b");
            }
            return m.Success ? m.Groups["v"].Value : "";
        }
        /// <summary>
        /// Returns the text between the page's <c>&lt;title&gt;</c> tags, limited to the
        /// first 50 characters; an empty string when the page has no title element.
        /// </summary>
        /// <param name="code">Raw markup of the page.</param>
        private string getTitle(string code) {
            if (string.IsNullOrEmpty(code))
            {
                return "";
            }

            // The published pattern "(?is)(?<=).*?(?=)" had lost its tag text and always
            // matched the empty string, so every page got an empty title.
            Match m = Regex.Match(code, @"(?is)<title\b[^>]*>(.*?)</title\s*>");
            string tit = m.Success ? m.Groups[1].Value : "";
            if (tit.Length > 50) tit = tit.Substring(0, 50);
            return tit;
        }
    }
}

using System;
using System.Collections.Generic;
using System.Text;

namespace 网页关键词处理
{
    /// <summary>
    /// Mutable pair of a keyword string and the count stored alongside it.
    /// </summary>
    class keys
    {
        /// <summary>The keyword text; null until assigned.</summary>
        public string Str { get; set; }

        /// <summary>The count stored for <see cref="Str"/>; 0 until assigned.</summary>
        public int Count { get; set; }
    }
}

C#

zhuolin

View all posts by zhuolin »»

Leave a Reply Cancel reply

Proudly powered by WordPress Premium Style Theme by www.gopiplus.com