```python
import csv
import os
import sys

from chinese import ChineseAnalyzer   # hanzi sentence tokenizer
from pypinyin import pinyin           # per-character pinyin syllables
from xpinyin import Pinyin            # hanzi-to-pinyin conversion with tone marks
# Command-line arguments: the input TSV of FSI frames and the unit number.
theFileName = str(sys.argv[1])
print('theFileName : ', theFileName)

theFileNamePrefix = os.path.dirname(theFileName)
print('theFileNamePrefix : ', theFileNamePrefix)

# Plain-text file that collects every hanzi sentence, one per line.
theHanziFile = os.path.join(theFileNamePrefix, 'hanzi.txt')

theUnit = str(sys.argv[2])
print('the Unit : ', theUnit)

theOutputFileName = os.path.join(theFileNamePrefix, 'FSI-' + theUnit + '-frames.csv')
print('theOutputFileName : ', theOutputFileName)

endOfLine = '\t9\t' + theUnit + '\tFSI-Chinese'
print('the endOfLine : ', endOfLine)

baseDirectory = os.getcwd()
print(' Base directory : ', baseDirectory)

analyzer = ChineseAnalyzer()   # tokenizes hanzi sentences into words
hanConvert = Pinyin()          # converts hanzi strings to pinyin with tone marks
with open(theOutputFileName, 'w') as o:
    fieldNames = ['numero', 'pinyin1', 'pinyinSyllabe1', 'hanzi1', 'hanzitokenized1', 'English1',
                  'pinyin2', 'pinyinSyllabe2', 'hanzi2', 'hanzitokenized2', 'English2',
                  'module', 'unit', 'course']
    csvwriter = csv.DictWriter(o, fieldnames=fieldNames, delimiter='\t')

    with open(theHanziFile, 'w') as t:
        print('text file opened')

        with open(theFileName, 'r') as f:
            print('csv file opened')
            data = csv.DictReader(f, delimiter='\t')

            finalRow = {}
            theLineNumber = 0

            for row in data:
                theLineNumber = theLineNumber + 1
                print('theLineNumber : ', theLineNumber)

                finalRow['numero'] = row['number']

                # Keep a plain-text copy of both hanzi sentences.
                t.write(row['hanzi1'] + '\n')
                t.write(row['hanzi2'] + '\n')

                finalRow['hanzi1'] = row['hanzi1']
                finalRow['hanzi2'] = row['hanzi2']

                # Tokenize the first sentence into words, then remove the spaces
                # that the join leaves in front of full-width punctuation.
                result = analyzer.parse(row['hanzi1'])
                tabHanzi = result.tokens()
                finalRow['hanzitokenized1'] = ' '.join(str(x) for x in tabHanzi)
                finalRow['hanzitokenized1'] = (finalRow['hanzitokenized1']
                                               .replace(' 。', '。').replace(' , ', ',')
                                               .replace(' ?', '?').replace(' !', '!'))

                # Same treatment for the second sentence of the frame.
                result = analyzer.parse(row['hanzi2'])
                tabHanzi = result.tokens()
                finalRow['hanzitokenized2'] = ' '.join(str(x) for x in tabHanzi)
                finalRow['hanzitokenized2'] = (finalRow['hanzitokenized2']
                                               .replace(' 。', '。').replace(' , ', ',')
                                               .replace(' ?', '?').replace(' !', '!'))

                # Word-level pinyin with tone marks, built from the tokenized sentence;
                # Chinese punctuation is mapped to its Western equivalent.
                temp = pinyin(row['hanzi1'])
                finalRow['pinyin1'] = (hanConvert.get_pinyin(finalRow['hanzitokenized1'], '', tone_marks='marks')
                                       .replace('。', '.').replace(',', ', ')
                                       .replace('?', '?').replace('!', '!')
                                       .capitalize())

                # Syllable-level pinyin: pypinyin returns one [syllable] list per
                # character, so flatten the list and strip all punctuation.
                thePinyinSentence = ' '.join(str(x) for x in temp).replace("'", '').replace(']', '').replace('[', '')
                thePinyinSentence = (thePinyinSentence
                                     .replace('.', '').replace('?', '').replace('!', '').replace(',', '')
                                     .replace('。', '').replace('?', '').replace('!', '').replace(',', ''))
                finalRow['pinyinSyllabe1'] = thePinyinSentence

                # Same two conversions for the second sentence.
                temp = pinyin(row['hanzi2'])
                finalRow['pinyin2'] = (hanConvert.get_pinyin(finalRow['hanzitokenized2'], '', tone_marks='marks')
                                       .replace('。', '.').replace(',', ', ')
                                       .replace('?', '?').replace('!', '!')
                                       .capitalize())

                thePinyinSentence = ' '.join(str(x) for x in temp).replace("'", '').replace(']', '').replace('[', '')
                thePinyinSentence = (thePinyinSentence
                                     .replace('.', '').replace('?', '').replace('!', '').replace(',', '')
                                     .replace('。', '').replace('?', '').replace('!', '').replace(',', ''))
                finalRow['pinyinSyllabe2'] = thePinyinSentence

                # Copy the English glosses and the fixed course metadata, then write the row.
                finalRow['English1'] = row['English1']
                finalRow['English2'] = row['English2']
                finalRow['module'] = 9
                finalRow['unit'] = int(theUnit)
                finalRow['course'] = 'FSI-Chinese'
                csvwriter.writerow(finalRow)
```
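The script takes two positional arguments, the path of the input TSV and the unit number, e.g. `python make_frames.py FSI/unit-12.csv 12` (the script and file names here are only placeholders). The input file needs a header row with at least the columns `number`, `hanzi1`, `hanzi2`, `English1` and `English2`. The sketch below is a minimal, stand-alone illustration of the two pinyin forms the loop produces for one sentence; the sample sentence and the expected outputs in the comments are assumptions and may vary with the installed versions of `chinese`, `pypinyin` and `xpinyin`.

```python
# Minimal sketch of the two pinyin representations built per sentence.
from chinese import ChineseAnalyzer
from pypinyin import pinyin
from xpinyin import Pinyin

sentence = '你好吗?'   # sample sentence, not taken from the FSI data

# Word tokenization, as used for the hanzitokenized1/2 columns.
tokens = ChineseAnalyzer().parse(sentence).tokens()
tokenized = ' '.join(str(t) for t in tokens).replace(' ?', '?')
print(tokenized)                                   # expected: 你好 吗?

# Word-level pinyin with tone marks (the pinyin1/2 columns); the spaces and
# punctuation already present in the tokenized string pass through.
print(Pinyin().get_pinyin(tokenized, '', tone_marks='marks'))
# expected: something like 'nǐhǎo ma?'

# Per-character syllables (the pinyinSyllabe1/2 columns): pypinyin returns one
# [syllable] list per character, which the script flattens and strips of punctuation.
print(pinyin(sentence))                            # expected: [['nǐ'], ['hǎo'], ['ma'], ['?']]
print(' '.join(s[0] for s in pinyin(sentence) if s[0] not in '?。!,'))
# expected: nǐ hǎo ma
```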