Trying out Chapter 3 of Programming Collective Intelligence, adapted for Japanese
Applying it to the top-ranked bloggers on topHatenar
There's a nice site called topHatenar, so I'll borrow its data.
OPML
I had some source code that uses ElementTree, so I download the file once first:
wget http://tophatenar.com/ranking/subscriber/1
Would Beautiful Soup or lxml, which can be pointed at a URL directly, be a better fit?
Well, this will do for now.
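For the record, lxml can parse straight from a URL and skip the wget step entirely. A minimal sketch; the OPML URL below is a placeholder, not the real topHatenar endpoint:

from lxml import etree

# lxml.etree.parse accepts a URL directly, so no local copy is needed.
# NOTE: the URL here is hypothetical -- substitute the real OPML location.
dom = etree.parse('http://tophatenar.com/example.opml')
outlines = dom.findall('.//outline')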
Creating feedlist.txt from the OPML
Run the following to create it:
#!/usr/bin/python
# -*- coding: utf8 -*-
from xml.etree import ElementTree

def create_feedlist(url):
    # Parse the OPML and collect the xmlUrl attribute of every outline
    dom = ElementTree.parse(url)
    outlines = dom.findall('.//outline')
    rss_uris = []
    for outline in outlines:
        if outline.get('xmlUrl'):
            rss_uris.append(outline.attrib['xmlUrl'])
    return rss_uris

if __name__ == '__main__':
    feedlist = create_feedlist('topHatener_top.opml')
    # Write one feed URL per line
    fw = open('feedlist.txt', 'w')
    for feed in feedlist:
        fw.write(feed + '\n')
    fw.close()
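A quick way to sanity-check create_feedlist without the real OPML file, e.g. run after importing the script above. The inline OPML fragment is made up:

from StringIO import StringIO

# Hypothetical OPML fragment; ElementTree.parse also accepts file-like objects
sample = StringIO('<opml version="1.1"><body>'
                  '<outline title="example" xmlUrl="http://example.com/rss"/>'
                  '</body></opml>')
print create_feedlist(sample)   # -> ['http://example.com/rss']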
Creating blogdata.txt from feedlist.txt
Based on Chapter 3 of Programming Collective Intelligence.
splitter.py
#!/usr/bin/python
# -*- coding: utf8 -*-
from urllib import urlopen, quote_plus
from BeautifulSoup import BeautifulSoup

appid = 'yahoo api key'
pageurl = 'http://api.jlp.yahoo.co.jp/MAService/V1/parse'

def split(sentence, appid=appid, results='ma', filter='1|2|3|4|5|9|10'):
    # Tokenize Japanese text with the Yahoo! Japan morphological analysis API
    sentence = quote_plus(sentence.encode('utf-8'))
    query = "%s?appid=%s&results=%s&uniq_filter=%s&sentence=%s" % \
            (pageurl, appid, results, filter, sentence)
    soup = BeautifulSoup(urlopen(query))
    try:
        # Collect the surface form of each word in the response
        return [l.surface.string for l in soup.ma_result.word_list]
    except:
        return []
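Usage looks like this (requires a valid Yahoo! Developer appid; the output below is illustrative, not an actual API response):

>>> import splitter
>>> splitter.split(u'集合知プログラミングを読む')
[u'集合知', u'プログラミング', u'読む']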
generatefeedvector.py
#!/usr/bin/python
# -*- coding: utf8 -*-
import feedparser
import re, traceback, sys
import splitter

def getwordcounts(url):
    # Parse the feed
    d = feedparser.parse(url)
    wc = {}
    # Loop over all the entries
    for e in d.entries:
        if 'summary' in e:
            summary = e.summary
        else:
            summary = e.description
        # Extract a list of words
        words = getwords(e.title + ' ' + summary)
        for word in words:
            wc.setdefault(word, 0)
            wc[word] += 1
    return d.feed.title, wc

def getwords(html):
    """Modified for Japanese"""
    # Remove all the HTML tags
    txt = re.compile(r'<[^>]+>').sub('', html)
    return [s.lower() for s in splitter.split(txt) if s != '']

#def getwords(html):
#    # Remove all the HTML tags
#    txt = re.compile(r'<[^>]+>').sub('', html)
#    # Split words by all non-alpha characters
#    words = re.compile(r'[^A-Z^a-z]+').split(txt)
#    # Convert to lowercase
#    return [word.lower() for word in words if word != '']

if __name__ == '__main__':
    apcount = {}
    wordcounts = {}
    # Strip trailing newlines so feedparser gets clean URLs
    feedlist = [line.strip() for line in file('feedlist.txt')]
    for feedurl in feedlist:
        print feedurl
        try:
            title, wc = getwordcounts(feedurl)
            wordcounts[title] = wc
            for word, count in wc.items():
                apcount.setdefault(word, 0)
                if count > 1:
                    apcount[word] += 1
        except:
            traceback.print_exc(file=sys.stdout)
            print 'Failed to parse feed %s' % feedurl

    # Keep only words that appear in 10%-50% of the blogs
    wordlist = []
    for w, bc in apcount.items():
        frac = float(bc) / len(feedlist)
        if frac > 0.1 and frac < 0.5:
            wordlist.append(w)

    out = file('blogdata.txt', 'w')
    out.write('Blog')
    for word in wordlist:
        out.write('\t%s' % word)
    out.write('\n')
    for blog, wc in wordcounts.items():
        out.write(blog)
        for word in wordlist:
            if word in wc:
                out.write('\t%d' % wc[word])
            else:
                out.write('\t0')
        out.write('\n')
But the resulting file (blogdata.txt) is awful (ToT):
Blog      ブログ  へ  どう  だっ  だけ  だろ  日  ・・・
ガ島通信  2       1   0     0     0     0     1   ・・・
The text-analysis part clearly needs some rework.
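One untested suspicion: splitter.py requests results=ma but passes the word-class list as uniq_filter. If the Yahoo! API only honors ma_filter for ma results, the filter would be silently ignored, which would explain why particles like へ and auxiliaries like だっ slip through. A possible one-line change in splitter.py, hypothesis only:

# Hypothesis (untested against the live API): with results='ma',
# pass the category list as ma_filter rather than uniq_filter.
query = "%s?appid=%s&results=%s&ma_filter=%s&sentence=%s" % \
        (pageurl, appid, results, filter, sentence)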
For now, pressing on.
Building the clusters
clusters.py
#!/usr/bin/python
# -*- coding: utf8 -*-
from math import sqrt
from PIL import Image, ImageDraw

def readfile(filename):
    lines = [line for line in file(filename)]
    # First line is the column titles
    colnames = lines[0].strip().split('\t')[1:]
    rownames = []
    data = []
    for line in lines[1:]:
        p = line.strip().split('\t')
        # First column in each row is the rowname
        rownames.append(p[0])
        # The data for this row is the remainder of the row
        data.append([float(x) for x in p[1:]])
    return rownames, colnames, data

def pearson(v1, v2):
    # Simple sums
    sum1 = sum(v1)
    sum2 = sum(v2)
    # Sums of the squares
    sum1Sq = sum([pow(v, 2) for v in v1])
    sum2Sq = sum([pow(v, 2) for v in v2])
    # Sum of the products
    pSum = sum([v1[i] * v2[i] for i in range(len(v1))])
    # Calculate r (Pearson score)
    num = pSum - (sum1 * sum2 / len(v1))
    den = sqrt((sum1Sq - pow(sum1, 2) / len(v1)) * (sum2Sq - pow(sum2, 2) / len(v1)))
    if den == 0: return 0
    # Return 1.0 - r so that smaller values mean more similar
    return 1.0 - num / den

class bicluster:
    def __init__(self, vec, left=None, right=None, distance=0.0, id=None):
        self.left = left
        self.right = right
        self.vec = vec
        self.id = id
        self.distance = distance

def hcluster(rows, distance=pearson):
    distances = {}
    currentclustid = -1
    # Clusters are initially just the rows
    clust = [bicluster(rows[i], id=i) for i in range(len(rows))]
    while len(clust) > 1:
        lowestpair = (0, 1)
        closest = distance(clust[0].vec, clust[1].vec)
        # Loop through every pair looking for the smallest distance
        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                # distances is the cache of distance calculations
                if (clust[i].id, clust[j].id) not in distances:
                    distances[(clust[i].id, clust[j].id)] = distance(clust[i].vec, clust[j].vec)
                d = distances[(clust[i].id, clust[j].id)]
                if d < closest:
                    closest = d
                    lowestpair = (i, j)
        # Calculate the average of the two clusters
        mergevec = [(clust[lowestpair[0]].vec[i] + clust[lowestpair[1]].vec[i]) / 2.0
                    for i in range(len(clust[0].vec))]
        # Create the new cluster
        newcluster = bicluster(mergevec, left=clust[lowestpair[0]],
                               right=clust[lowestpair[1]],
                               distance=closest, id=currentclustid)
        # Cluster ids that weren't in the original set are negative
        currentclustid -= 1
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newcluster)
    return clust[0]

def printclust(clust, labels=None, n=0):
    # Indent to make a hierarchy layout
    for i in range(n): print ' ',
    if clust.id < 0:
        # Negative id means that this is a branch
        print '-'
    else:
        # Positive id means that this is an endpoint
        if labels == None: print clust.id
        else: print labels[clust.id]
    # Now print the right and left branches
    if clust.left != None: printclust(clust.left, labels=labels, n=n + 1)
    if clust.right != None: printclust(clust.right, labels=labels, n=n + 1)

def getheight(clust):
    # Is this an endpoint? Then the height is just 1
    if clust.left == None and clust.right == None: return 1
    # Otherwise the height is the sum of the heights of each branch
    return getheight(clust.left) + getheight(clust.right)

def getdepth(clust):
    # The distance of an endpoint is 0.0
    if clust.left == None and clust.right == None: return 0
    # Otherwise it is the greater of the branch depths plus its own distance
    return max(getdepth(clust.left), getdepth(clust.right)) + clust.distance

def drawdendrogram(clust, labels, jpeg='clusters.jpg'):
    # Height and width
    h = getheight(clust) * 20
    w = 1200
    depth = getdepth(clust)
    # Width is fixed, so scale distances accordingly
    scaling = float(w - 150) / depth
    # Create a new image with a white background
    img = Image.new('RGB', (w, h), (255, 255, 255))
    draw = ImageDraw.Draw(img)
    draw.line((0, h / 2, 10, h / 2), fill=(255, 0, 0))
    # Draw the first node
    drawnode(draw, clust, 10, (h / 2), scaling, labels)
    img.save(jpeg, 'JPEG')

def drawnode(draw, clust, x, y, scaling, labels):
    if clust.id < 0:
        h1 = getheight(clust.left) * 20
        h2 = getheight(clust.right) * 20
        top = y - (h1 + h2) / 2
        bottom = y + (h1 + h2) / 2
        # Line length
        ll = clust.distance * scaling
        # Vertical line from this cluster to children
        draw.line((x, top + h1 / 2, x, bottom - h2 / 2), fill=(255, 0, 0))
        # Horizontal line to left item
        draw.line((x, top + h1 / 2, x + ll, top + h1 / 2), fill=(255, 0, 0))
        # Horizontal line to right item
        draw.line((x, bottom - h2 / 2, x + ll, bottom - h2 / 2), fill=(255, 0, 0))
        drawnode(draw, clust.left, x + ll, top + h1 / 2, scaling, labels)
        drawnode(draw, clust.right, x + ll, bottom - h2 / 2, scaling, labels)
    else:
        # If this is an endpoint, draw the item label
        draw.text((x + 5, y - 7), labels[clust.id], (0, 0, 0))

# Run only when executed as a script
if __name__ == '__main__':
    blognames, words, data = readfile('blogdata.txt')
    clust = hcluster(data)
    printclust(clust, labels=blognames)
    drawdendrogram(clust, blognames, jpeg='blogclust.jpg')
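As a quick sanity check of the Pearson-based distance (0.0 for perfectly correlated vectors, 2.0 for perfectly anti-correlated ones):

>>> from clusters import pearson
>>> pearson([1, 2, 3], [2, 4, 6])   # perfectly correlated
0.0
>>> pearson([1, 2, 3], [3, 2, 1])   # perfectly anti-correlated
2.0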
I wrote out the JPEG, but the labels are garbled...
It seems I need to install a font or something, so let me try to fix the garbled text.
And then, garbled text
Googling around, I found a page that handled this by installing the IPA fonts, so I'll follow suit.
Download the IPA fonts here:
http://ossipedia.ipa.go.jp/ipafont/download.php?
Installation instructions are here:
http://ossipedia.ipa.go.jp/ipafont/fontinstall.html
It's just a matter of unzipping the downloaded archive and dropping the font files inside into /Library/Fonts.
Then tweak the source a little:
from PIL import ImageFont

# Load the IPA Gothic font installed above for drawing the labels
font = ImageFont.truetype('/Library/Fonts/ipagp.ttf', 18, encoding='unic')

def drawnode(draw, clust, x, y, scaling, labels):
    if clust.id < 0:
        :
        :
    else:
        # If this is an endpoint, draw the item label
        draw.text((x + 5, y - 7),
                  unicode(labels[clust.id], 'utf8'),
                  (0, 0, 0), font=font)  # <- specify the font
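To verify the font setup independently of the dendrogram, here is a minimal render test, assuming the same font path as above:

from PIL import Image, ImageDraw, ImageFont

# Render a short Japanese string to confirm PIL + IPA font work together
font = ImageFont.truetype('/Library/Fonts/ipagp.ttf', 18, encoding='unic')
img = Image.new('RGB', (200, 40), (255, 255, 255))
draw = ImageDraw.Draw(img)
draw.text((5, 10), u'日本語テスト', (0, 0, 0), font=font)
img.save('font_test.jpg', 'JPEG')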
And when I output the JPEG again, the garbled text was fixed!!
But the clustering itself is a mess (ToT)
For now I'll ignore the clustering quality and try to get as far as the 2D plot next time.