#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""TF-IDF keyword search over a pickled index.

The index is a pickle file holding a mapping with two entries:

* ``"tf"``  -- ``{document path: {word: weight}}``
* ``"idf"`` -- ``{word: weight}``

The score of a document for a word is ``tf[path][word] * idf[word]``.
For display, each document path has its last three characters replaced by
``.html`` and the title is taken from the first line of that file that
contains ``h1`` or ``h2``.

Command-line usage: ``python <this file> WORD`` (reads ``database.pickle``
from the current directory).
"""
import os
import pickle
import re
import sys
from xml.sax.saxutils import escape, quoteattr

# Codecs tried, in this order, when guessing the encoding of a byte string.
# NOTE(review): shift_jis is tried before utf-8, and many UTF-8 byte
# sequences also decode "successfully" as Shift_JIS, so UTF-8 input can come
# back as mojibake.  The order is kept as-is because other code may rely on
# it; prefer passing already-decoded text to Search.find().
_CANDIDATE_CODECS = (
    'shift_jis', 'utf-8', 'euc_jp', 'cp932',
    'euc_jis_2004', 'euc_jisx0213', 'iso2022_jp', 'iso2022_jp_1',
    'iso2022_jp_2', 'iso2022_jp_2004', 'iso2022_jp_3', 'iso2022_jp_ext',
    'shift_jis_2004', 'shift_jisx0213', 'utf_16', 'utf_16_be',
    'utf_16_le', 'utf_7', 'utf_8_sig',
)

# ``unicode`` on Python 2, ``str`` on Python 3.
_TEXT_TYPE = type(u"")

# Non-greedy match of anything between angle brackets (used to drop tags).
_TAG_RE = re.compile(r"<.*?>")


def tounicode(data):
    """Decode *data* by trying a fixed list of codecs in order.

    Args:
        data: a byte string.  Text that is already decoded is returned
            unchanged.

    Returns:
        The text produced by the first codec that decodes *data* without
        error, or ``None`` when no codec succeeds or *data* cannot be
        decoded at all (it has no ``decode`` method).
    """
    if isinstance(data, _TEXT_TYPE):
        return data
    decode = getattr(data, "decode", None)
    if decode is None:
        return None
    for codec in _CANDIDATE_CODECS:
        try:
            return decode(codec)
        except (UnicodeError, LookupError):
            # Wrong guess (or codec unavailable in this build): try the next.
            continue
    return None


def _read_title(filename):
    """Return the text of the first ``h1``/``h2`` line of *filename*.

    The line is decoded as UTF-8, anything between ``<`` and ``>`` is
    removed, leading spaces and trailing whitespace are stripped.  When no
    line contains ``h1`` or ``h2`` the base name of the file is returned
    instead of raising.
    """
    with open(filename, "rb") as handle:
        for raw_line in handle:
            if b"h1" in raw_line or b"h2" in raw_line:
                line = raw_line.decode("utf-8")
                return _TAG_RE.sub("", line).lstrip(" ").rstrip()
    return os.path.basename(filename)


class Search(object):
    """Look up a word in the index and render the ranked hits."""

    def __init__(self, database):
        """Load database.

        Args:
            database: path of the pickle file containing the ``"tf"`` and
                ``"idf"`` mappings.
        """
        # NOTE(security): unpickling runs arbitrary code embedded in the
        # file; only load index files from a trusted source.
        with open(database, "rb") as handle:
            database = pickle.load(handle)
        self.tf = database["tf"]
        self.idf = database["idf"]
        # State of the most recent find(); read by showHTML()/showTEXT().
        self.word = None
        self.results = []

    def find(self, word):
        """Search word.

        Args:
            word: the word to look up.  A byte string is decoded with
                tounicode() first.

        Returns:
            If the word is found    : the list of [path, TFIDF score] pairs,
                                      highest score first.
            If the word is not found: None.
        """
        if isinstance(word, bytes):
            word = tounicode(word)
        # Reset first so a miss never leaves the previous hits behind.
        self.word = word
        self.results = []
        if word not in self.idf:
            return None
        idf_weight = self.idf[word]
        results = [
            [fullname, weights[word] * idf_weight]
            for fullname, weights in self.tf.items()
            if word in weights
        ]
        # Stable sort: documents with equal scores keep their index order.
        results.sort(key=lambda hit: hit[1], reverse=True)
        self.results = results
        return results

    def showHTML(self, nbest=5):
        """Render the last search as an HTML page.

        Args:
            nbest: maximum number of hits to include.

        Returns:
            The page as a single string.
        """
        # NOTE(review): the tags below were reconstructed -- only the visible
        # text ("Search result", "Keyword", "Rank", "Title", "Link") was
        # available.  Confirm the layout against the intended template.
        parts = [
            u"<html>\n"
            u"<head>\n"
            u"<title>Search result</title>\n"
            u"</head>\n"
            u"<body>\n"
            u"<h1>Search result</h1>\n"
            u"<p>Keyword: %s</p>\n" % escape(u"%s" % (self.word,))
        ]
        for rank, (fullname, score) in enumerate(self.results):
            if rank >= nbest:
                break
            filename = fullname[:-3] + ".html"
            title = _read_title(filename)
            parts.append(
                u"<dl>\n"
                u"<dt>Rank</dt>\n"
                u"<dd>%d (Score: %s)</dd>\n"
                u"<dt>Title</dt>\n"
                u"<dd>%s</dd>\n"
                u"<dt>Link</dt>\n"
                u"<dd><a href=%s>%s</a></dd>\n"
                u"</dl>\n" % (
                    rank + 1,
                    score,
                    escape(title),
                    quoteattr(filename),  # adds the surrounding quotes
                    escape(filename),
                )
            )
        parts.append(u"</body>\n</html>\n")
        return u"".join(parts)

    def showTEXT(self, nbest=5):
        """Render the last search as plain text.

        Args:
            nbest: maximum number of hits to include.

        Returns:
            A string starting with a ``Search word:`` line followed by one
            Rank/Title/Link entry per hit.
        """
        # NOTE(review): the line breaks inside each entry were reconstructed;
        # confirm against the intended layout.
        parts = ["Search word: %s\n" % (self.word,)]
        for rank, (fullname, score) in enumerate(self.results):
            if rank >= nbest:
                break
            filename = fullname[:-3] + ".html"
            parts.append(
                "Rank %d (score: %s)\nTitle: %s\nLink : %s\n\n"
                % (rank + 1, score, _read_title(filename), filename)
            )
        return "".join(parts)


def main(argv=None):
    """Search ``database.pickle`` for the word given on the command line.

    Args:
        argv: argument vector; defaults to ``sys.argv``.

    Returns:
        Process exit status: 0 after a search (hit or miss), 2 on bad usage.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.stderr.write("usage: %s WORD\n" % argv[0])
        return 2
    word = argv[1]
    if isinstance(word, bytes):
        # Python 2 hands over raw bytes.  Decode explicitly as UTF-8 rather
        # than letting find() guess the encoding.
        word = word.decode("utf-8")
    search = Search("database.pickle")
    if search.find(word):
        print(search.showTEXT())
    else:
        print("Not found.")
    return 0


if __name__ == "__main__":
    sys.exit(main())