# Program to complete the grammar CSV (tab separated) file with audio and translations
# Licence: MIT
# Copyrights : Eric Streit 2022
#
# * adding pinyin to the file
# * adding end of line (1-1-FSI-Chinese)
# * dumping all Hanzi parts into a hanzi.txt file
#
# Processing of Module 9 of FSI Chinese
# * fix the punctuation of columns 2 and 5 (!;,.?) and the spaces around punctuation
# * split the hanzi sentences into words (add the hanzitokenized field)
# * convert these words to pinyin (add pinyinWords)
# * rename pinyin to pinyinSyllabe
# * capitalize the first character of pinyin
# * remove the punctuation from pinyinSyllabe
# * generate audio
#   NOTE(review): no audio is generated by the code in this file; GoogleTTS is
#   imported but never called here.
#
# parameters:
# * name of the file to deal with
# * name of the unit (to be added at the end of each line)
#
# 5 original fields: number, English1, hanzi1, English2, hanzi2

# the CSV library
import csv
import copy
# os module
import os
# random
import random
import re
# shutils (moving files)
import shutil
import sys
# delay
import time
from typing import TYPE_CHECKING, Generator

# the library to split the Chinese sentences into words
import jieba
# the pinyin library
from pypinyin import pinyin, lazy_pinyin, Style
from xpinyin import Pinyin
# hanzi tokenizer
from chinese import ChineseAnalyzer
# google TTS
from GoogleTTS import GoogleTTS

# Values written unchanged into the last three columns of every output row.
MODULE_NUMBER = 9
COURSE_NAME = 'FSI-Chinese'

# Column order of the generated tab-separated file.
FIELD_NAMES = ['numero', 'pinyin1', 'pinyinSyllabe1', 'hanzi1', 'hanzitokenized1', 'English1',
               'pinyin2', 'pinyinSyllabe2', 'hanzi2', 'hanzitokenized2', 'English2',
               'module', 'unit', 'course']

# CJK / full-width punctuation is spelled with escapes on purpose: typed as
# literals these characters are easily folded into their ASCII look-alikes,
# which silently turns a replacement into a no-op.
CJK_FULL_STOP = '\u3002'    # ideographic full stop
FW_COMMA = '\uff0c'         # full-width comma
FW_QUESTION = '\uff1f'      # full-width question mark
FW_EXCLAMATION = '\uff01'   # full-width exclamation mark
FW_FULL_STOP = '\uff0e'     # full-width full stop

# Joining tokens with spaces also puts spaces around punctuation tokens; these
# ordered replacements glue the punctuation back onto the preceding word.
# The first four pairs are the historical ones and keep their original order.
_TOKEN_SPACING_FIXES = (
    (' ' + CJK_FULL_STOP, CJK_FULL_STOP),
    (' , ', ','),
    (' ?', '?'),
    (' !', '!'),
    (' ' + FW_COMMA + ' ', FW_COMMA),
    (' ' + FW_QUESTION, FW_QUESTION),
    (' ' + FW_EXCLAMATION, FW_EXCLAMATION),
)

# Chinese punctuation -> Western punctuation for the word-level pinyin.
# The ASCII comma must be handled before the full-width one, otherwise the
# ', ' produced for a full-width comma would be expanded a second time.
_PINYIN_PUNCTUATION = (
    (CJK_FULL_STOP, '.'),
    (',', ', '),
    (FW_COMMA, ', '),
    (FW_QUESTION, '?'),
    (FW_EXCLAMATION, '!'),
)

# Characters removed from the syllable-level pinyin.  The quote and bracket
# characters are kept in the set because earlier versions of this script
# removed them as well.
_SYLLABLE_STRIP = str.maketrans(
    '', '',
    "'[]" + '.?!,' + CJK_FULL_STOP + FW_FULL_STOP + FW_COMMA + FW_QUESTION + FW_EXCLAMATION)


def tokenize_hanzi(analyzer, sentence):
    """Return *sentence* split into space-separated words.

    The words come from ``analyzer.parse(sentence).tokens()``; punctuation is
    then re-attached to the preceding word.
    """
    spaced = ' '.join(str(token) for token in analyzer.parse(sentence).tokens())
    for old, new in _TOKEN_SPACING_FIXES:
        spaced = spaced.replace(old, new)
    return spaced


def words_to_pinyin(converter, tokenized):
    """Return the capitalized, tone-marked pinyin of a tokenized sentence.

    *converter* is the xpinyin ``Pinyin`` instance.  An empty splitter is
    passed so that the only spaces in the result are those already present
    between the words of *tokenized*.
    """
    text = converter.get_pinyin(tokenized, '', tone_marks='marks')
    for old, new in _PINYIN_PUNCTUATION:
        text = text.replace(old, new)
    return text.capitalize()


def syllable_pinyin(sentence):
    """Return the pinyin of *sentence* syllable by syllable, without punctuation.

    ``pypinyin.pinyin`` returns a list of lists of readings; the readings are
    joined directly instead of scrubbing the ``repr`` of each inner list.
    A removed punctuation token still leaves its two surrounding spaces, as in
    earlier versions of this script.
    """
    readings = (str(reading) for group in pinyin(sentence) for reading in group)
    return ' '.join(readings).translate(_SYLLABLE_STRIP)


def build_row(row, analyzer, converter, unit_number):
    """Build one output record (a fresh dict) from one input record."""
    final_row = {'numero': row['number']}
    for suffix in ('1', '2'):
        hanzi = row['hanzi' + suffix]
        tokenized = tokenize_hanzi(analyzer, hanzi)
        final_row['hanzi' + suffix] = hanzi
        final_row['hanzitokenized' + suffix] = tokenized
        final_row['pinyin' + suffix] = words_to_pinyin(converter, tokenized)
        final_row['pinyinSyllabe' + suffix] = syllable_pinyin(hanzi)
        final_row['English' + suffix] = row['English' + suffix]
    # the remaining fields
    final_row['module'] = MODULE_NUMBER
    final_row['unit'] = unit_number
    final_row['course'] = COURSE_NAME
    return final_row


def main(argv=None):
    """Run the conversion.

    *argv* defaults to ``sys.argv``; ``argv[1]`` is the input file and
    ``argv[2]`` the unit.  The output file ``FSI-<unit>-frames.csv`` and
    ``hanzi.txt`` are written next to the input file.  No header row is
    written to the output file.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 3:
        sys.exit('usage: <script> <input tab-separated file> <unit number>')

    # the file name to work with
    input_name = str(args[1])
    print('theFileName : ', input_name)
    prefix = os.path.dirname(input_name)
    print('theFileNamePrefix : ', prefix)
    hanzi_name = os.path.join(prefix, 'hanzi.txt')
    unit = str(args[2])
    print('the Unit : ', unit)
    # the output filename according to the input filename
    output_name = os.path.join(prefix, 'FSI-' + unit + '-frames.csv')
    print('theOutputFileName : ', output_name)
    end_of_line = '\t' + str(MODULE_NUMBER) + '\t' + unit + '\t' + COURSE_NAME
    print('the endOfLine : ', end_of_line)

    base_directory = os.getcwd()
    print(' Base directory : ', base_directory)

    # Convert once, and before any output file is truncated.
    try:
        unit_number = int(unit)
    except ValueError:
        sys.exit('the unit must be an integer, got ' + repr(unit))

    # initialise the Hanzi tokenizer
    analyzer = ChineseAnalyzer()
    # initialise the Hanzi -> pinyin converter
    converter = Pinyin()

    # Explicit encodings: the data is Chinese text and must not depend on the
    # platform locale ('utf-8-sig' also accepts an input without a BOM).
    # newline='' is what the csv module requires for the files it reads/writes.
    with open(output_name, 'w', encoding='utf-8', newline='') as out_file, \
            open(hanzi_name, 'w', encoding='utf-8') as hanzi_file, \
            open(input_name, 'r', encoding='utf-8-sig', newline='') as in_file:
        print('text file opened')
        print('csv file opened')
        writer = csv.DictWriter(out_file, fieldnames=FIELD_NAMES, delimiter='\t')
        # see the names of the input fields at the top of the file
        reader = csv.DictReader(in_file, delimiter='\t')
        for line_number, row in enumerate(reader, start=1):
            print('theLineNumber : ', line_number)
            # save the Hanzi into the text file
            hanzi_file.write(row['hanzi1'] + '\n')
            hanzi_file.write(row['hanzi2'] + '\n')
            writer.writerow(build_row(row, analyzer, converter, unit_number))


if __name__ == '__main__':
    main()