集合知プログラミング3章をやってみる じゃぱん対応

TopHatenar の ランキング上位の人達に適用してみる

TopHatenarというすてきなサイトがあるので、ここのデータを使わせてもらう。

opml

ElementTree使ったソースがあったので、ファイル1回落とす。
wget http://tophatenar.com/ranking/subscriber/1

URL直指定できるBeautiful Soupやlxmlの方がいいのだろうか。
まぁ、とりあえずこれですすめる。

opml からfeedlist.txt作成

以下実行して作成する。

#!/usr/bin/python
# -*- coding: utf8 -*-

import xml
from xml.etree import ElementTree

def create_feedlist(url):
    """Collect the feed URLs listed in an OPML document.

    Args:
        url: Path (or open file object) of the OPML file; it is handed
            straight to ``ElementTree.parse``.

    Returns:
        A list with the non-empty ``xmlUrl`` attribute of every
        ``outline`` element, in document order.
    """
    dom = ElementTree.parse(url)
    rss_uris = []
    # ".//outline" is the relative spelling of the former "//outline";
    # absolute paths on a tree are only accepted for backward compatibility.
    for outline in dom.findall(".//outline"):
        # Outlines that merely group other outlines carry no xmlUrl
        # attribute; .get() skips them instead of raising KeyError.
        xml_url = outline.attrib.get('xmlUrl')
        if xml_url:
            rss_uris.append(xml_url)
    return rss_uris

if __name__ == '__main__':
    # Extract the feed URLs from the downloaded OPML file and write them,
    # one per line, to feedlist.txt.
    feedlist = create_feedlist('topHatener_top.opml')
    # The path used to be prefixed with DIR, a name that is never defined
    # in this script (NameError). The file is written to the current
    # directory, which is where the word-count script opens 'feedlist.txt'.
    fw = open('feedlist.txt', 'w')
    try:
        for feed in feedlist:
            fw.write(feed + '\n')
    finally:
        # Close the file even if a write fails.
        fw.close()

feedlist.txtからblogdata.txt作成。

集合知プログラミング3章より。

splitter.py

#!/usr/bin/python
# -*- coding: utf8 -*-

from urllib import urlopen, quote_plus
from BeautifulSoup import BeautifulSoup


# Application id for the Yahoo! JAPAN web API (placeholder value).
appid='yahoo api key'
# Endpoint of the morphological-analysis service.
pageurl='http://api.jlp.yahoo.co.jp/MAService/V1/parse'

def split(sentence, appid=appid, results='ma', filter='1|2|3|4|5|9|10' ):
    """Split a sentence into words using the remote morphological analyser.

    Args:
        sentence: Text to analyse; it is encoded as UTF-8 before sending.
        appid: Application id sent with the request.
        results: Value of the ``results`` request parameter.
        filter: '|'-separated part-of-speech numbers to keep.

    Returns:
        A list with the ``surface`` string of every word found under
        ``ma_result.word_list`` in the response, or an empty list when the
        response does not have that structure.
    """
    sentence=quote_plus(sentence.encode('utf-8'))
    # The filter used to be sent as ``uniq_filter``.
    # NOTE(review): per the MAService parameter list, ``uniq_filter`` only
    # affects ``results=uniq``, so with the default ``results='ma'`` no
    # part-of-speech filtering took place; the generic ``filter`` parameter
    # is sent instead - confirm against the API reference.
    # The value is URL-quoted because it contains '|'.
    query="%s?appid=%s&results=%s&filter=%s&sentence=%s" % \
        (pageurl, appid, results, quote_plus(filter), sentence)
    soup = BeautifulSoup(urlopen(query))
    try:
        return [l.surface.string for l in soup.ma_result.word_list]
    except Exception:
        # Best effort: an unexpected response shape yields no words, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return []

generatefeedvector.py

#!/usr/bin/python
# -*- coding: utf8 -*-

import feedparser
import re, traceback, sys
import splitter

def getwordcounts(url):
    """Count the words used in the entries of one feed.

    Args:
        url: Location of the feed, passed to ``feedparser.parse``.

    Returns:
        A tuple ``(feed title, counts)`` where ``counts`` maps each word
        returned by ``getwords`` to the number of times it occurred in the
        titles and summaries of the feed's entries.
    """
    # Parse the feed
    d = feedparser.parse(url)
    wc = {}

    # Loop over all the entries
    for e in d.entries:
        # Prefer the summary and fall back to the description. An entry
        # that has neither (or no title) contributes an empty string
        # instead of raising, which would discard the whole feed.
        if 'summary' in e:
            summary = e.summary
        else:
            summary = e.get('description', '')
        title = e.get('title', '')

        # Extract a list of words and tally them
        for word in getwords(title + ' ' + summary):
            wc[word] = wc.get(word, 0) + 1

    # NOTE(review): a feed without a title still fails on this line and is
    # reported by the caller - confirm that dropping it is intended.
    return d.feed.title, wc

def getwords(html):
    """Turn a piece of HTML into a list of lowercase words.

    Variant adapted for Japanese text: after the tags are removed the
    text is handed to ``splitter.split`` instead of being split on
    non-alphabetic characters. Empty tokens are dropped.
    """
    # Strip everything that looks like an HTML tag
    tag_free = re.sub(r'<[^>]+>', '', html)
    words = []
    for token in splitter.split(tag_free):
        if token != '':
            words.append(token.lower())
    return words

#def getwords(html):
#    # Remove all the HTML tags
#    txt = re.compile(r'<[^>]+>').sub('', html)
#    # Split words by all non-alpha characters
#    words = re.compile(r'[^A-Z^a-z]+').split(txt)
#    # Convert to lowercase
#    return [word.lower() for word in words if word != '']

if __name__ == '__main__':
    # apcount: word -> number of blogs in which the word occurs more than once
    apcount = {}
    # wordcounts: blog title -> {word: count}
    wordcounts = {}

    # Read the feed URLs. Each line is stripped so the URL no longer carries
    # its trailing newline, and blank lines are skipped so they do not
    # inflate the number of feeds used for the fractions below.
    src = open('feedlist.txt')
    try:
        feedlist = [line.strip() for line in src if line.strip()]
    finally:
        src.close()

    for feedurl in feedlist:
        print(feedurl)
        try:
            title, wc = getwordcounts(feedurl)
        except Exception:
            # A broken feed is reported and skipped; the run continues.
            traceback.print_exc(file=sys.stdout)
            print('Failed to parse feed %s' % feedurl)
            continue
        wordcounts[title] = wc
        for word, count in wc.items():
            apcount.setdefault(word, 0)
            if count > 1:
                apcount[word] += 1

    # Keep only words whose blog fraction lies strictly between 10% and 50%.
    wordlist = []
    for w, bc in apcount.items():
        frac = float(bc) / len(feedlist)
        if frac > 0.1 and frac < 0.5:
            wordlist.append(w)

    # Write the tab-separated matrix: a header row with the words, then one
    # row per blog with the count of each word.
    # NOTE(review): titles and words may be unicode objects under Python 2;
    # writing non-ASCII ones to a plain file depends on the interpreter's
    # default encoding - confirm.
    out = open('blogdata.txt', 'w')
    try:
        out.write('Blog')
        for word in wordlist:
            out.write('\t%s' % word)
        out.write('\n')
        for blog, wc in wordcounts.items():
            out.write(blog)
            for word in wordlist:
                if word in wc:
                    out.write('\t%d' % wc[word])
                else:
                    out.write('\t0')
            out.write('\n')
    finally:
        # Flush and close the output even if a write fails.
        out.close()

でもできあがったファイル(blogdata.txt)がひどい(ToT)

Blog	ブログ	へ	どう	だっ	だけ	だろ	日	・・・
ガ島通信	2	1	0	0	0	0	1	・・・

ちょっと文字解析のところ改造せんと。
とりあえず先に進む。

クラスターつくる

clusters.py

#!/usr/bin/python
# -*- coding: utf8 -*-

def readfile(filename):
    """Load a tab-separated data matrix.

    The first line holds the column titles (its first cell is ignored).
    Every following line holds a row name followed by numeric cells.

    Args:
        filename: Path of the file to read.

    Returns:
        A tuple ``(rownames, colnames, data)`` where ``data`` is a list of
        rows, each a list of floats. An empty file yields three empty lists.
    """
    f = open(filename)
    try:
        # Blank lines (for example a trailing newline at the end of the
        # file) would otherwise become a row named '' with no data.
        lines = [line for line in f if line.strip()]
    finally:
        f.close()
    if not lines:
        return [], [], []
    # First line is the column titles
    colnames = lines[0].strip().split('\t')[1:]
    rownames = []
    data = []
    for line in lines[1:]:
        p = line.strip().split('\t')
        # First column in each row is the rowname
        rownames.append(p[0])
        # The data for this row is the remainder of the row
        data.append([float(x) for x in p[1:]])
    return rownames, colnames, data

from math import sqrt
def pearson(v1,v2):
    """Return the Pearson-correlation distance ``1 - r`` of two vectors.

    Args:
        v1: Sequence of numbers.
        v2: Sequence of numbers; expected to have the same length as ``v1``.

    Returns:
        ``1.0 - r`` where ``r`` is the Pearson correlation coefficient, so
        perfectly correlated vectors give 0 and perfectly anti-correlated
        vectors give 2. Returns 0 when the coefficient is undefined (empty
        input, or a vector without variance).
    """
    # Use a float count so integer input is not truncated by Python 2's
    # integer division.
    n = float(len(v1))
    if n == 0:
        return 0
    # Simple sums
    sum1 = sum(v1)
    sum2 = sum(v2)
    # Sums of the squares
    sum1Sq = sum([pow(v, 2) for v in v1])
    sum2Sq = sum([pow(v, 2) for v in v2])
    # Sum of the products
    pSum = sum([a * b for a, b in zip(v1, v2)])
    # Calculate r (Pearson score)
    num = pSum - (sum1 * sum2 / n)
    # Rounding can push a zero variance term marginally below zero, which
    # would make sqrt() raise; clamp each term at zero.
    var1 = max(sum1Sq - pow(sum1, 2) / n, 0.0)
    var2 = max(sum2Sq - pow(sum2, 2) / n, 0.0)
    den = sqrt(var1 * var2)
    if den == 0:
        return 0
    return 1.0 - num / den

class bicluster:
    """One node of a hierarchical-clustering tree.

    Attributes set by the constructor:
        vec: the vector represented by this node
        id: identifier of the node
        left, right: child nodes (None when absent)
        distance: distance value stored for this node (default 0.0)
    """
    def __init__(self,vec,left=None,right=None,distance=0.0,id=None):
        # Payload of the node
        self.vec = vec
        self.id = id
        # Links to the children and the distance associated with them
        self.left = left
        self.right = right
        self.distance = distance

def hcluster(rows,distance=pearson):
    """Build a hierarchical cluster tree by repeatedly merging the closest pair.

    Args:
        rows: List of equal-length numeric vectors; must not be empty
            (an empty list raises IndexError).
        distance: Function taking two vectors and returning their distance.

    Returns:
        The root ``bicluster``. Leaves carry the index of their row as id,
        merged nodes carry negative ids, the element-wise average of their
        two children as vector and the distance at which they were merged.
    """
    # Cache of distance calculations, keyed by the pair of cluster ids
    distances = {}
    currentclustid = -1
    # Clusters are initially just the rows
    clust = [bicluster(row, id=i) for i, row in enumerate(rows)]
    while len(clust) > 1:
        # Start without a candidate so the first pair (0, 1) also goes
        # through the cache instead of being recomputed on every round.
        # Ties still resolve to the earliest pair, as before.
        lowestpair = None
        closest = None
        # loop through every pair looking for the smallest distance
        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                key = (clust[i].id, clust[j].id)
                if key not in distances:
                    distances[key] = distance(clust[i].vec, clust[j].vec)
                d = distances[key]
                if closest is None or d < closest:
                    closest = d
                    lowestpair = (i, j)
        first = clust[lowestpair[0]]
        second = clust[lowestpair[1]]
        # calculate the average of the two clusters
        mergevec = [(a + b) / 2.0 for a, b in zip(first.vec, second.vec)]
        # create the new cluster
        newcluster = bicluster(mergevec, left=first, right=second,
                               distance=closest, id=currentclustid)
        # cluster ids that weren't in the original set are negative
        currentclustid -= 1
        # Delete the higher index first so the lower one stays valid
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newcluster)
    return clust[0]

def printclust(clust,labels=None,n=0):
    """Print a cluster tree as an indented outline, one node per line.

    Args:
        clust: Node to print; its children are printed recursively.
        labels: Optional sequence indexed by leaf id; when given, leaves
            are printed by label instead of by id.
        n: Nesting depth of ``clust``; each level indents by two spaces.
    """
    # Build the whole line first. The indentation used to come from
    # repeated ``print ' ',`` statements and their implicit soft-space;
    # two spaces per level yields the same text without relying on that.
    indent = '  ' * n
    if clust.id < 0:
        # negative id means that this is a branch
        text = '-'
    elif labels is None:
        # non-negative id means that this is an endpoint
        text = clust.id
    else:
        text = labels[clust.id]
    print('%s%s' % (indent, text))
    # now print the left and right branches
    if clust.left is not None:
        printclust(clust.left, labels=labels, n=n + 1)
    if clust.right is not None:
        printclust(clust.right, labels=labels, n=n + 1)

from PIL import Image,ImageDraw
def getheight(clust):
    """Return the number of endpoints (leaves) below ``clust``.

    A node without children counts as 1; any other node counts as the
    sum of the heights of its two branches.
    """
    is_endpoint = clust.left is None and clust.right is None
    if is_endpoint:
        return 1
    left_height = getheight(clust.left)
    right_height = getheight(clust.right)
    return left_height + right_height

def getdepth(clust):
    """Return the total distance from ``clust`` down to its deepest endpoint.

    An endpoint has depth 0; any other node has the larger depth of its
    two branches plus its own ``distance``.
    """
    if clust.left is None and clust.right is None:
        return 0
    deepest_branch = max(getdepth(clust.left), getdepth(clust.right))
    return deepest_branch + clust.distance

def drawdendrogram(clust,labels,jpeg='clusters.jpg'):
    """Render a cluster tree as a dendrogram and save it as a JPEG file.

    Args:
        clust: Root node of the tree.
        labels: Sequence of leaf labels, indexed by leaf id.
        jpeg: Path of the image file to write.
    """
    # height and width: 20 pixels per endpoint, fixed width
    h = getheight(clust) * 20
    w = 1200
    depth = getdepth(clust)
    # width is fixed, so scale distances accordingly. A depth of 0 (a
    # single endpoint, or every merge at distance 0) used to raise
    # ZeroDivisionError; every line then has length 0, so any factor works.
    if depth:
        scaling = float(w - 150) / depth
    else:
        scaling = 1.0
    # Create a new image with a white background
    img = Image.new('RGB', (w, h), (255, 255, 255))
    draw = ImageDraw.Draw(img)
    # h is a multiple of 20, so the floor division is exact
    middle = h // 2
    draw.line((0, middle, 10, middle), fill=(255, 0, 0))
    # Draw the first node
    drawnode(draw, clust, 10, middle, scaling, labels)
    img.save(jpeg, 'JPEG')

def drawnode(draw,clust,x,y,scaling,labels):
    """Draw ``clust`` and everything below it, starting at point (x, y).

    An endpoint is drawn as its label. A branch is drawn as a vertical
    line joining its two children plus one horizontal line per child whose
    length is the branch's distance times ``scaling``; the children are
    then drawn recursively at the ends of those lines.
    """
    if not clust.id < 0:
        # Endpoint: write the item label next to the incoming line
        draw.text((x + 5, y - 7), labels[clust.id], (0, 0, 0))
        return

    red = (255, 0, 0)
    # Vertical room needed by each child: 20 pixels per endpoint
    h1 = getheight(clust.left) * 20
    h2 = getheight(clust.right) * 20
    half_span = (h1 + h2) / 2
    top = y - half_span
    bottom = y + half_span
    # Vertical positions at which the two children attach
    left_y = top + h1 / 2
    right_y = bottom - h2 / 2
    # Horizontal position of the children
    child_x = x + clust.distance * scaling

    # Vertical line from this cluster to its children
    draw.line((x, left_y, x, right_y), fill=red)
    # Horizontal line to the left item
    draw.line((x, left_y, child_x, left_y), fill=red)
    # Horizontal line to the right item
    draw.line((x, right_y, child_x, right_y), fill=red)

    drawnode(draw, clust.left, child_x, left_y, scaling, labels)
    drawnode(draw, clust.right, child_x, right_y, scaling, labels)

# Run only when executed as a script
if __name__ == '__main__':
    # Cluster the blogs in blogdata.txt, print the tree and draw it.
    blognames,words,data=readfile('blogdata.txt')
    clust=hcluster(data)
    printclust(clust,labels=blognames)
    # The image path used to be prefixed with DIR, a name that is never
    # defined in this script (NameError); the image is written to the
    # current directory, next to blogdata.txt.
    drawdendrogram(clust,blognames,jpeg='blogclust.jpg')

JPEG出力したのだが、文字化け。。。
どうもフォントとか入れる必要がある?ようなので、文字化け解消してみる。

そして文字化け

ググっていると、IPAフォント入れて対応しているページをみっけたので、これに従いIPAフォント入れて対応してみる。

以下からIPAフォントダウンロード
http://ossipedia.ipa.go.jp/ipafont/download.php?
インストール方法はここ
http://ossipedia.ipa.go.jp/ipafont/fontinstall.html

ただ単純にダウンロードしたファイルを解凍してやって、その中身のファイルを/Library/Fontsにいれてやるだけ

そしてソースちょっと改造(ImageFontも使うので、importを from PIL import Image,ImageDraw,ImageFont に変更しておく)

font = ImageFont.truetype('/Library/Fonts/ipagp.ttf', 18, encoding='unic')

def drawnode(draw,clust,x,y,scaling,labels):
    if clust.id<0:
            :
            :
    else:
        # If this is an endpoint, draw the item label
        draw.text((x+5,y-7), unicode(labels[clust.id],'utf8'), (0,0,0), font=font) ←フォント指定

そしてJPEG出力したら、文字ばけなおった!!

f:id:shohu33:20080819023443j:image

でも分類めちゃくちゃ(ToT)
とりあえず分類精度は無視して、2次元の図まで次回は進んでみる。