# Program to complete the grammar CSV (tab-separated) file with audio and translations
# Licence: MIT
# Copyright: Eric Streit 2022
#
# - adds pinyin to the file
# - adds end-of-line fields (1-1-FSI-Chinese)
# - dumps all Hanzi parts into a hanzi.txt file
#
# Processing of Module 7 of FSI Chinese
#
# Original columns:
#
# input:
#   lettre  pinyinOriginal  hanziOriginal  anglais
#
# output:
#   lettre  pinyinOriginal  hanziCorrige  anglais  module  lesson  origine
#
# A companion file is written with the pinyin on 2 lines (original, then generated) for comparison.
#
# * fix the punctuation of column 5 (!;,.?) and the spaces around punctuation
# * convert hanziOriginal to pinyin
# * add the module, lesson and origine fields
#
# Parameters:
# * name of the file to process
# * name of the unit (added at the end of each line)
#
# 5 original fields: number, English1, hanzi1, English2, hanzi2

# the CSV library
import csv
from typing import TYPE_CHECKING, Generator
# the library to split the Chinese sentences into words
import jieba
# the pinyin library
from pypinyin import pinyin, lazy_pinyin, Style
from xpinyin import Pinyin
from opencc import OpenCC
# hanzi tokenizer
from chinese import ChineseAnalyzer
# os module
import os
# shutil (moving files)
import shutil
# random
import random, copy, re, sys
# google TTS
from GoogleTTS import GoogleTTS
# delay
import time

# the file name to work with
theFileName = str(sys.argv[1])
print('theFileName : ', theFileName)
theFileNamePrefix = os.path.dirname(theFileName)
print('theFileNamePrefix : ', theFileNamePrefix)
theUnit = str(sys.argv[2])
print('the Unit : ', theUnit)
theModule = 7
print(theModule)
theOrigin = "FSI-Chinese"
print(theOrigin)
theNumber = theFileName.split('.')[0][-1]
print('the number : ', theNumber)
# the output filename derived from the input filename
theOutputFileName = os.path.join(theFileNamePrefix, 'FSI-' + theUnit + '-dialogue-' + theNumber + '.simp.csv')
print('theOutputFileName : ', theOutputFileName)

baseDirectory = os.getcwd()
print(' Base directory : ', baseDirectory)

# the pinyin comparison file
thePinyinFile = os.path.join(theFileNamePrefix, 'dialogue-' + theNumber + '.txt')

# initialise the Hanzi tokenizer
analyzer = ChineseAnalyzer()
# initialise the Hanzi-to-pinyin converter
hanConvert = Pinyin()
# initialise OpenCC (traditional to simplified)
cc = OpenCC('t2s')

with open(theOutputFileName, 'w') as o:
    fieldNames = ['lettre', 'pinyinOriginal', 'hanziCorrige', 'anglais', 'module', 'lesson', 'origine']
    csvwriter = csv.DictWriter(o, fieldnames=fieldNames, delimiter='\t')
    with open(thePinyinFile, 'w') as t:
        print('pinyin comparison text file opened')
        with open(theFileName, 'r') as f:
            print('csv file opened')
            inputFieldsNames = ['lettre', 'pinyinOriginal', 'hanziOriginal', 'anglais']
            data = csv.DictReader(f, delimiter='\t', fieldnames=inputFieldsNames)
            finalRow = {}
            theLineNumber = 0
            for row in data:
                theLineNumber = theLineNumber + 1
                print('theLineNumber : ', theLineNumber)
                # copy the letter and the original pinyin unchanged
                finalRow['lettre'] = row['lettre']
                finalRow['pinyinOriginal'] = row['pinyinOriginal']
                # opencc: converted = cc.convert(to_convert)
                # convert from traditional to simplified characters
                hanziOriginal = cc.convert(row['hanziOriginal'])
                # fix the hanzi punctuation (ASCII to full-width)
                theHanziCorrige = hanziOriginal.replace('.', '。').replace('?', '?').replace(',', ',').replace('!', '!')
                # split the sentences into words
                result = analyzer.parse(theHanziCorrige)
                tabHanzi = result.tokens()
                hanzitokenized = ' '.join(str(x) for x in tabHanzi)
                finalRow['hanziCorrige'] = theHanziCorrige
                # generate the pinyin from the tokenized corrected hanzi
                thePinyinGenere = (
                    hanConvert.get_pinyin(hanzitokenized, '', tone_marks='marks')
                    .replace('。', '.').replace(',', ', ').replace('?', '?').replace('!', '!')
                    .capitalize()
                )
                finalRow['anglais'] = row['anglais']
                finalRow['module'] = theModule
                finalRow['lesson'] = theUnit
                finalRow['origine'] = theOrigin
                csvwriter.writerow(finalRow)
                # write the pinyin comparison file (original pinyin, then generated pinyin)
                t.write(finalRow['pinyinOriginal'] + '\n')
                t.write(thePinyinGenere + '\n\n')
# the with blocks close the three files automatically on exit
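
# Example invocation (a sketch: the script name and directory layout below are assumptions,
# only the two positional arguments -- input CSV path and unit name -- come from the code above):
#
#   python complete_grammar.py FSI/module7/dialogue-1.csv 1
#
# With that input, theNumber would be '1', and the script would write
# FSI/module7/FSI-1-dialogue-1.simp.csv plus FSI/module7/dialogue-1.txt
# for the pinyin comparison.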