1234567891011121314151617181920212223242526272829303132333435363738394041424344454647 |
# Segment a Chinese text file into words with jieba and write one
# tab-separated row per word: pinyin, hanzi, translation, module, unit, course.
#
# Usage: python <this script> <input path containing 'hanzi.txt'> <unit>
# The output path is the input path with 'hanzi.txt' replaced by 'words.csv'.
import csv
import sys

import jieba
# the pinyin library
from pypinyin import pinyin, lazy_pinyin, Style
from googletrans import Translator

# Column order of the output file (no header row is written).
FIELD_NAMES = ['Pinyin', 'Hanzi', 'English', 'module', 'unit', 'course']
# A segment that is a substring of this string is treated as punctuation
# and produces no output row.
SKIPPED_SEGMENTS = ' ;,。!:、?'
MODULE_NUMBER = 9
COURSE_NAME = 'FSI-Chinese'


def to_pinyin(word):
    """Return the pinyin of *word* as one string with no separators.

    ``pinyin()`` returns one inner list of readings per character; the
    readings are joined directly rather than by stringifying the lists and
    stripping quotes/brackets, which corrupted segments that themselves
    contain a quote, bracket or backslash.
    """
    return ''.join(''.join(readings) for readings in pinyin(word))


def main():
    """Read the input file named on the command line and write the word list.

    Exits with a message when the arguments are missing or when the output
    path would be the same file as the input.
    """
    if len(sys.argv) < 3:
        sys.exit('usage: ' + sys.argv[0] + ' <...hanzi.txt> <unit>')

    file_name = str(sys.argv[1])
    the_unit = str(sys.argv[2])
    print ('the Unit : ', the_unit)

    out_put_file_name = file_name.replace('hanzi.txt', 'words.csv')
    print ('Filename : ', file_name, '**', 'outPutFileName : ', out_put_file_name)

    # Without 'hanzi.txt' in the name the replace() above is a no-op, and
    # opening that path for writing would truncate the input before it is read.
    if out_put_file_name == file_name:
        sys.exit("input file name must contain 'hanzi.txt': " + file_name)

    translator = Translator()

    # Explicit UTF-8 so the Chinese text does not depend on the platform's
    # default encoding; newline='' is what the csv module expects for files
    # it writes. Both files are closed by the with-statement, also on error.
    with open(file_name, 'r', encoding='utf-8') as source, \
            open(out_put_file_name, 'w', encoding='utf-8', newline='') as target:
        print('output csv file opened')
        csvwriter = csv.DictWriter(target, fieldnames=FIELD_NAMES, delimiter='\t')

        for line_number, line in enumerate(source, start=1):
            print ('LineNumber : ', line_number)
            for segment in jieba.cut(line, cut_all=True):
                # Skip empty / whitespace-only segments and punctuation.
                if not segment.strip() or segment in SKIPPED_SEGMENTS:
                    continue
                csvwriter.writerow({
                    'Pinyin': to_pinyin(segment),
                    'Hanzi': segment,
                    'English': translator.translate(segment).text,
                    'module': MODULE_NUMBER,
                    'unit': the_unit,
                    'course': COURSE_NAME,
                })


if __name__ == '__main__':
    main()
|