不需要词库，直接分割网页内容来提取词语，并统计每个词语的出现次数，按从多到少排序。该方法能区分中英文词语，最初用于广告匹配的数据预处理，使用 JavaScript 在访客的客户端（浏览器）中执行。
// Dictionary-free keyword extraction for web pages.
// Original author: Shi Zhuolin, 2008-07-12, second version
// (left-to-right plus right-to-left matching).
//
// Embedding: load this code from a <script type="text/javascript"> element.
// fnStartInit expects the host page to contain the ad frame, e.g.
//   <iframe scrolling="no" frameborder="0" width="240" height="320"
//           name="adid" id="adid"></iframe>
//
// Example:
//   var words = keywords(document.title, document.body.innerText);
//   // words -> [{ str: 'most frequent', count: 7 }, ...]

/**
 * Extracts repeated words/phrases from page text without a word list and
 * ranks them by weighted frequency.
 *
 * Every substring of the body that is 2..15 characters long, contains no
 * break character and holds at most 8 non-ASCII (e.g. Chinese) characters is
 * a candidate. A candidate's weight is its number of non-overlapping
 * occurrences in the body plus twice its occurrences in the title;
 * candidates with a weight below 2 are discarded.
 *
 * Two heuristics then reduce overlapping candidates:
 *  - left to right: when a stored candidate is immediately followed by a
 *    longer stored candidate that starts with it, the shorter one keeps only
 *    the difference of the two weights (and is dropped below 2). The value
 *    kept for a word is the one computed at its last occurrence in the text.
 *  - right to left: walking the stored candidates backwards, a candidate that
 *    is a suffix of the candidate stored just before it is reduced by that
 *    candidate's weight.
 *
 * Works both as `keywords(t, b)` and `new keywords(t, b)`; either form
 * yields the result array.
 *
 * @param {string} ftitle Page title; null/undefined is treated as ''.
 * @param {string} ftbody Page text; null/undefined is treated as ''.
 * @returns {Array<{str: string, count: number}>} Candidates sorted by count,
 *   highest first; ties keep reverse discovery order.
 */
function keywords(ftitle, ftbody) {
  var MAX_WORD_LENGTH = 15; // longest English word considered
  var MAX_WIDE_CHARS = 8; // longest Chinese phrase considered
  var TITLE_WEIGHT = 2; // a title hit counts twice
  var MIN_COUNT = 2; // words seen only once are discarded
  // Punctuation that always ends a candidate. The first literal is the
  // original list; its ASCII entries are already covered by the code ranges
  // in isWordBreak, so the full-width forms are listed as well to make
  // ordinary Chinese punctuation break words too.
  var BREAK_CHARS = '。,:… (—)》《' + '，：；、！？（）“”‘’【】\u3000';

  var hasOwn = Object.prototype.hasOwnProperty;
  var title = ftitle == null ? '' : String(ftitle);
  var body = (ftbody == null ? '' : String(ftbody))
    .replace(/\s+/g, ' ')
    .replace(/^\s+|\s+$/g, '');

  // Insertion-ordered dictionary (replaces the IE-only Scripting.Dictionary):
  // `counts` maps word -> current count, `order` lists the words in the
  // order they were first added; a removed word that is added again goes to
  // the end.
  var counts = {};
  var order = [];
  var weightCache = {};

  // True for the end of the text, listed punctuation, and every character
  // code up to 254 that is not an ASCII digit or letter.
  function isWordBreak(ch) {
    if (ch === '' || BREAK_CHARS.indexOf(ch) !== -1) {
      return true;
    }
    var code = ch.charCodeAt(0);
    return code <= 47 ||
      (code >= 58 && code <= 64) ||
      (code >= 91 && code <= 96) ||
      (code >= 123 && code <= 254);
  }

  // Counts non-overlapping literal occurrences of `word` in `text`.
  function countOccurrences(text, word) {
    var total = 0;
    var at = text.indexOf(word);
    while (at !== -1) {
      total++;
      at = text.indexOf(word, at + word.length);
    }
    return total;
  }

  // Body frequency plus weighted title frequency, computed once per word.
  function weightOf(word) {
    if (!hasOwn.call(weightCache, word)) {
      weightCache[word] = countOccurrences(body, word) +
        TITLE_WEIGHT * countOccurrences(title, word);
    }
    return weightCache[word];
  }

  // Stores a count, appending the word to `order` when it is new.
  function setCount(word, value) {
    if (!hasOwn.call(counts, word)) {
      order.push(word);
    }
    counts[word] = value;
  }

  // Forgets a word entirely.
  function removeWord(word) {
    if (!hasOwn.call(counts, word)) {
      return;
    }
    delete counts[word];
    for (var k = order.length - 1; k >= 0; k--) {
      if (order[k] === word) {
        order.splice(k, 1);
        break;
      }
    }
  }

  // True when `text` ends with `tail`.
  function endsWith(text, tail) {
    var at = text.length - tail.length;
    return at >= 0 && text.lastIndexOf(tail) === at;
  }

  // Pass 1: collect candidates, left-to-right prefix adjustment.
  var prevWord = '';
  var prevCount = 0;
  for (var start = 0; start < body.length; start++) {
    var wideChars = 0;
    for (var len = 1; len <= MAX_WORD_LENGTH; len++) {
      var ch = body.charAt(start + len - 1);
      if (isWordBreak(ch)) {
        break;
      }
      if (ch.charCodeAt(0) > 254) {
        wideChars++;
        if (wideChars > MAX_WIDE_CHARS) {
          break;
        }
      }
      if (len < 2) {
        continue; // single characters are never keywords
      }
      var word = body.substring(start, start + len);
      var count = weightOf(word);
      if (count < MIN_COUNT) {
        continue;
      }
      setCount(word, count);
      // Only a strictly longer extension reduces the previous candidate;
      // meeting the very same word again must not cancel it out.
      if (prevCount > 0 &&
          word.length > prevWord.length &&
          word.indexOf(prevWord) === 0) {
        var remaining = prevCount - count;
        if (remaining >= MIN_COUNT) {
          setCount(prevWord, remaining);
        } else {
          removeWord(prevWord);
        }
      }
      prevWord = word;
      prevCount = count;
    }
  }

  // Pass 2: walk candidates backwards, right-to-left suffix adjustment.
  var ranked = [];
  for (var idx = order.length - 1; idx >= 0; idx--) {
    var entry = order[idx];
    var score = counts[entry];
    if (idx > 0 && endsWith(order[idx - 1], entry)) {
      score -= counts[order[idx - 1]];
    }
    if (score >= MIN_COUNT) {
      ranked.push({ str: entry, count: score, seq: ranked.length });
    }
  }

  // Highest count first; `seq` keeps ties in a fixed order on engines whose
  // sort is not stable.
  ranked.sort(function (x, y) {
    return (y.count - x.count) || (x.seq - y.seq);
  });

  var result = [];
  for (var r = 0; r < ranked.length; r++) {
    result.push({ str: ranked[r].str, count: ranked[r].count });
  }
  return result;
}

/**
 * Ready-state handler: points the first frame of the page (the ad iframe)
 * at its target URL. Does nothing when the page has no frame yet.
 */
function fnStartInit() {
  var adFrame = window.frames[0];
  if (adFrame) {
    adFrame.location.href = 'http://www.163.com';
  }
}

// Register the handler only when running in a browser page. Standard DOM
// first; attachEvent is kept as the fallback for old Internet Explorer.
if (typeof document !== 'undefined') {
  if (document.addEventListener) {
    document.addEventListener('readystatechange', fnStartInit, false);
  } else if (document.attachEvent) {
    document.attachEvent('onreadystatechange', fnStartInit);
  }
}
Leave a Reply