#!/usr/bin/env python
#    (C) 2008 Parag Nemade <paragn@fedoraproject.org>
# 
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

import codecs
import os
import gobject
import sys
import string
import re

# Intermediate files shared by the WordXt pipeline stages.
# Written by WordXt.read_file, read back by WordXt.extract_words.
wordsfile = "/tmp/words.txt"
# Written by WordXt.extract_words, read back by WordXt.remove_dups.
wordstmpfile = "/tmp/wordsdic.txt"
# Written by WordXt.remove_dups: the de-duplicated word list.
dicfile = "/tmp/words.dic"

class WordXt(object):
    """Command-line driver that builds a hunspell word list from text files.

    Instantiating the class runs the whole job: it validates ``sys.argv``,
    concatenates every file found under the given directory, extracts the
    words that survive the ASCII/punctuation filter, de-duplicates them,
    runs ``/usr/bin/wordlist2hunspell`` on the result and finally terminates
    the process.  The constructor therefore never returns normally.
    """

    # Characters that end a sentence; each one becomes a line break.
    _SENTENCE_END = re.compile(r"[.!:?]")
    # Punctuation that separates words; each one becomes a single space.
    _SEPARATORS = re.compile(r"[,\-';:+{}=/@&*!~`#$^|_?<>\\]")
    # Characters dropped outright: quotes, brackets, '%', ASCII digits/letters.
    _DROPPED = re.compile(r'["\[\]()%0-9a-zA-Z]')

    def __init__(self):
        """Validate the command line and run the pipeline.

        Expects exactly two arguments in ``sys.argv``: a language code of
        five or six characters (e.g. ``hi_IN``) and the directory holding
        the text data.

        Raises:
            SystemExit: always.  Status 1 on a usage error, otherwise
                whatever ``Create`` exits with.
        """
        arguments = sys.argv[1:]
        if len(arguments) != 2:
            print("usage:wordxtr <LangCode> <Full directory path to text data files>")
            sys.exit(1)

        lang, dirname = arguments
        if len(lang) < 5 or len(lang) > 6:
            print("Enter correct language isocode along with country code.")
            print("Check /usr/share/iso-codes/iso_639.tab for language code")
            print("e.g. For Hindi language use hi_IN or for Nepali language use ne_NP")
            sys.exit(1)
        self.lng = lang

        self.dirname = dirname
        # isdir rather than exists: a plain file would pass an existence
        # check and then silently produce an empty dictionary.
        if not os.path.isdir(self.dirname):
            print("Enter full directory path where text files exists ")
            sys.exit(1)

        print("Creating dictionary for language \"%s\"" % self.lng
              + " using text data in directory \"%s\"" % self.dirname)
        self.Create(self.dirname)

    def read_file(self, filename, outfile=None):
        """Split the lines of *filename* into candidate words, one per line.

        Lines starting with ``msgstr "`` have that prefix and their last
        character removed before being split on single spaces.  For a line
        with several tokens, each token longer than one character is
        written unless it contains ``%`` after its first character.  A line
        with a single token is written unchanged (stripped of surrounding
        whitespace).

        Args:
            filename: path of the UTF-8 text file to read.
            outfile: path of the UTF-8 file to write; defaults to the
                module-level ``wordsfile``.
        """
        if outfile is None:
            outfile = wordsfile

        with codecs.open(filename, "r", "utf-8") as src:
            lines = [raw.strip() for raw in src.readlines()]

        with codecs.open(outfile, "w", "utf-8") as out:
            for line in lines:
                if line.startswith('msgstr "'):
                    tokens = line[8:-1].split(' ')
                else:
                    tokens = line.split(' ')

                if len(tokens) > 1:
                    for token in tokens:
                        # A '%' past the first character marks a format
                        # placeholder glued to the word; skip such tokens.
                        if len(token) > 1 and '%' not in token[1:]:
                            out.write(token + '\n')
                else:
                    out.write(line + '\n')

    def extract_words(self, infile=None, outfile=None):
        """Strip punctuation, digits and ASCII letters and list the words.

        Runs of newlines become spaces, sentence terminators become line
        breaks, separator punctuation becomes spaces and quotes, brackets,
        ``%``, ASCII digits and ASCII letters are deleted.  Every non-empty
        word that remains is written on its own line, duplicates included,
        in order of appearance.

        Args:
            infile: path of the UTF-8 file to read; defaults to the
                module-level ``wordsfile``.
            outfile: path of the UTF-8 file to write; defaults to the
                module-level ``wordstmpfile``.
        """
        if infile is None:
            infile = wordsfile
        if outfile is None:
            outfile = wordstmpfile

        # Decode once at the boundary and work on text throughout.
        with codecs.open(infile, "r", "utf-8") as src:
            text = src.read()

        text = re.sub(r"\n+", " ", text)
        text = self._SENTENCE_END.sub("\n", text)
        text = self._SEPARATORS.sub(" ", text)
        text = self._DROPPED.sub("", text)

        words = []
        for sentence in text.split("\n"):
            words.extend(word for word in sentence.split(" ") if word)

        with codecs.open(outfile, "w", "utf-8") as out:
            out.write("".join(word + "\n" for word in words))

    def remove_dups(self, infile=None, outfile=None):
        """Write the distinct lines of *infile* to *outfile*, sorted.

        Empty lines are dropped so the dictionary never contains a blank
        entry, and the output is sorted so repeated runs give the same file.

        Args:
            infile: path of the UTF-8 file to read; defaults to the
                module-level ``wordstmpfile``.
            outfile: path of the UTF-8 file to write; defaults to the
                module-level ``dicfile``.
        """
        if infile is None:
            infile = wordstmpfile
        if outfile is None:
            outfile = dicfile

        with codecs.open(infile, "r", "utf-8") as src:
            unique = set(src.read().split("\n"))
        # The trailing newline of the input yields one empty string.
        unique.discard("")

        with codecs.open(outfile, "w", "utf-8") as out:
            out.write("".join(word + "\n" for word in sorted(unique)))

    def Create(self, dirname):
        """Run the full pipeline over *dirname* and exit the process.

        Concatenates every file under *dirname* (recursively) into
        ``/tmp/<lng>.dat``, runs ``read_file``, ``extract_words`` and
        ``remove_dups`` on it, then invokes ``/usr/bin/wordlist2hunspell``
        with the de-duplicated word list and ``self.lng``.

        Args:
            dirname: directory containing UTF-8 text files.

        Raises:
            SystemExit: always, once the external command has returned.
        """
        alltext = "/tmp/%s.dat" % self.lng

        print("00%....Creating Text Data to Parse")
        # The combined file must be closed before read_file reopens it,
        # otherwise buffered data may not have reached the disk yet.
        with codecs.open(alltext, "w", "utf-8") as combined:
            for root, dirs, files in os.walk(dirname, topdown=False):
                for fname in files:
                    # Join with root, not dirname, so files inside
                    # subdirectories resolve to their real location.
                    path = os.path.join(root, fname)
                    with codecs.open(path, "r", "utf-8") as src:
                        text = src.read()
                    combined.write(text)
                    # Keep the last line of one file from fusing with the
                    # first line of the next.
                    if text and not text.endswith("\n"):
                        combined.write("\n")

        print("25%....Reading Text Data to Parse")
        self.read_file(alltext)
        print("50%....Created Text Data to Parse")
        self.extract_words()
        print("65%....Extracted words from input Text Data")
        self.remove_dups()
        print("80%....Removed duplicated words from extracted wordlist")
        cmd = "/usr/bin/wordlist2hunspell %s %s " % (dicfile, self.lng)
        os.system(cmd)
        print("......in current directory")
        sys.exit()

if __name__ == "__main__":
    # Constructing WordXt parses sys.argv, runs the whole pipeline and
    # terminates the process itself, so nothing may follow this call.
    WordXt()
