#!/usr/bin/env python
#-*- coding: utf-8 -*-
import os
import pickle
import re
def tounicode(data):
    """Decode a byte string to text by trying a fixed list of encodings.

    The encodings are tried in order and the first one that decodes without
    an error wins, so this is a best-effort guess rather than real charset
    detection.

    data: byte string to decode.  Text that is already decoded is returned
        unchanged.

    Returns the decoded text, or None when *data* is not a byte string or no
    encoding in the list can decode it.
    """
    if isinstance(data, type(u"")):
        # Already text: there is nothing to decode.
        return data
    if not isinstance(data, (bytes, bytearray)):
        return None

    # utf-8 is tried first because it rejects almost every byte sequence that
    # was not produced as UTF-8, whereas shift_jis accepts many multi-byte
    # UTF-8 sequences and silently turns them into wrong characters.
    codecs = (
        'utf-8', 'shift_jis', 'euc_jp', 'cp932',
        'euc_jis_2004', 'euc_jisx0213', 'iso2022_jp', 'iso2022_jp_1',
        'iso2022_jp_2', 'iso2022_jp_2004', 'iso2022_jp_3', 'iso2022_jp_ext',
        'shift_jis_2004', 'shift_jisx0213', 'utf_16', 'utf_16_be',
        'utf_16_le', 'utf_7', 'utf_8_sig',
    )
    for codec in codecs:
        try:
            return data.decode(codec)
        except (UnicodeError, LookupError):
            # UnicodeError: the bytes are not valid in this encoding.
            # LookupError: this interpreter does not provide the codec.
            continue
    return None
class Search:
    """Keyword lookup over a pickled TF-IDF index.

    The index is a dict with two entries:
      "tf":  maps each indexed path to a dict of word -> term frequency
      "idf": maps each word to its inverse-document-frequency weight

    find() stores the last successful query in self.word / self.results;
    showHTML() and showTEXT() render that stored query, so they must be
    called after a find() that returned hits.
    """

    def __init__(self, database):
        """Load the index from the pickle file at path *database*."""
        # NOTE(review): unpickling can execute arbitrary code; only load
        # index files that come from a trusted source.
        with open(database, "rb") as handle:
            index = pickle.load(handle)
        self.tf = index["tf"]
        self.idf = index["idf"]

    def find(self, word):
        """Search word.

        A byte-string *word* is first decoded with the module-level
        tounicode() helper.

        If the word is found    : return the list of [path, TFIDF score],
                                  best score first (also kept in
                                  self.results, with the word in self.word)
        If the word is not found: return None (self.word and self.results
                                  keep whatever the previous query stored)
        """
        if isinstance(word, bytes):
            word = tounicode(word)
        if word not in self.idf:
            return None

        self.word = word
        idfw = self.idf[word]
        results = [
            [fullname, counts[word] * idfw]
            for fullname, counts in self.tf.items()
            if word in counts
        ]
        # The sort is stable, so equal scores keep their index order.
        results.sort(key=lambda entry: entry[1], reverse=True)
        self.results = results
        return results

    @staticmethod
    def _escape(text):
        """Escape the characters that are special in HTML text/attributes."""
        return (text.replace("&", "&amp;")
                    .replace("<", "&lt;")
                    .replace(">", "&gt;")
                    .replace('"', "&quot;"))

    @staticmethod
    def _page_path(fullname):
        """Return the HTML page path that belongs to an indexed path."""
        # Drops the last three characters of the indexed path; presumably
        # that is a dot plus a two-letter extension -- confirm with the
        # code that builds the index.
        return fullname[:-3] + ".html"

    @staticmethod
    def _extract_title(filename):
        """Return the text of the first line of *filename* mentioning h1/h2.

        Tags are removed, leading spaces and trailing whitespace are
        stripped, and the line is decoded as UTF-8.  Returns an empty string
        when no line contains "h1" or "h2".
        """
        with open(filename, "rb") as page:
            for raw in page:
                if b"h1" in raw or b"h2" in raw:
                    title = re.sub(b"<.*?>", b"", raw)
                    return title.lstrip(b" ").rstrip().decode("utf-8")
        return u""

    def _top_hits(self, nbest):
        """Yield (rank, score, page path, title) for the best *nbest* hits."""
        for index, (fullname, score) in enumerate(
                self.results[:max(nbest, 0)]):
            filename = self._page_path(fullname)
            yield index + 1, score, filename, self._extract_title(filename)

    def showHTML(self, nbest=5):
        """Render the best *nbest* hits of the last query as an HTML page.

        NOTE(review): the tags of the original template could not be
        recovered; the layout below (one definition list per hit) is a
        reconstruction -- confirm it against the expected page format.
        """
        header = (u"<html>\n"
                  u"<head><title>Search result</title></head>\n"
                  u"<body>\n"
                  u"<h1>Search result</h1>\n"
                  u"<p>Keyword: %s</p>\n")
        parts = [header % self._escape(self.word)]
        for rank, score, filename, title in self._top_hits(nbest):
            link = self._escape(filename)
            parts.append(u"<dl>\n")
            parts.append(u"<dt>Rank</dt>\n<dd>%d (Score: %s)</dd>\n"
                         % (rank, score))
            # The title is taken from an HTML page, so any entities in it
            # are already escaped; escaping again would corrupt them.
            parts.append(u"<dt>Title</dt>\n<dd>%s</dd>\n" % title)
            parts.append(u"<dt>Link</dt>\n<dd><a href=\"%s\">%s</a></dd>\n"
                         % (link, link))
            parts.append(u"</dl>\n")
        parts.append(u"</body>\n</html>\n")
        return u"".join(parts)

    def showTEXT(self, nbest=5):
        """Render the best *nbest* hits of the last query as plain text."""
        parts = [u"Search word: " + self.word + u"\n"]
        for rank, score, filename, title in self._top_hits(nbest):
            parts.append(u"Rank %d (score: %s)\nTitle: %s\nLink : %s\n"
                         % (rank, score, title, filename))
        return u"".join(parts)
if __name__ == "__main__":
    # Command-line entry point: look up the word given as the first argument
    # in "database.pickle" and print the best hits as plain text.
    import sys

    # Check the argument before loading the index so a missing keyword gives
    # a usage message instead of an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit("usage: %s WORD" % sys.argv[0])

    word = sys.argv[1]
    if isinstance(word, bytes):
        # Python 2 delivers arguments as bytes.  Decode them once here as
        # UTF-8 and hand text to find(), so find() does not have to guess
        # the encoding again.
        word = word.decode("utf-8")

    search = Search("database.pickle")
    if search.find(word):
        # search.showHTML() renders the same hits as an HTML page.
        print(search.showTEXT())
    else:
        print("Not found.")