# Repository: eric / FSI-Chinese
#
# Program to complete the grammar CSV (tab separated) file with audio and translations
# Licence: MIT
# Copyrights : Eric Streit <eric@yojik.eu> 2022

# adding pinyin to the file
# adding end of line (1-1-FSI-Chinese)
# dumping all Hanzi parts into a hanzi.txt file

# Processing of Module 7 of FSI Chinese
#
# original columns:
#
# input:
# numero lettre pinyinOriginal hanziOriginal anglais
#
# output:
# numero lettre pinyinOriginal hanziCorrige anglais module lesson origine
# (the pinyin generated from the hanzi is not a CSV column; it is written
#  next to the original pinyin in pinyin.txt for comparison)
#
# * Fix the punctuation of the hanzi column (. ? , !)
# * Convert the hanzi into pinyin
# * Add the module, lesson and origine fields

# parameters:
# * name of the file to deal with
# * name of the unit (to be added at the end of each line)

# 5 original fields: numero, lettre, pinyinOriginal, hanziOriginal, anglais

# the CSV library
import csv
from typing import TYPE_CHECKING, Generator
# the library to split the Chinese sentences into words
import jieba
# the pinyin library
from pypinyin import pinyin, lazy_pinyin, Style
from xpinyin import Pinyin

from opencc import OpenCC

# hanzi tokenizer
from chinese import ChineseAnalyzer

# os module
import os
# shutils (moving files)
import shutil
# random
import random,copy, re, sys

# google TTS
from GoogleTTS import GoogleTTS
# delay
import time

#
# ---------------------------------------------------------------------------
# Command-line arguments, derived paths and converter set-up
# ---------------------------------------------------------------------------
# Expected invocation: <script> <input-file> <unit>
#   argv[1]: the tab-separated input file to process
#   argv[2]: the unit name (written into the 'lesson' column of every row)
# Stop with a usage message rather than an IndexError traceback when an
# argument is missing.
if len(sys.argv) < 3:
    sys.exit('usage: ' + os.path.basename(sys.argv[0]) + ' <input-file> <unit>')

# the file name to work with
theFileName = str(sys.argv[1])
print('theFileName : ', theFileName)

# directory of the input file; every generated file is written next to it
theFileNamePrefix = os.path.dirname(theFileName)
print('theFileNamePrefix : ', theFileNamePrefix)

theUnit = str(sys.argv[2])
print('the Unit : ', theUnit)

# value written into the 'module' column of every output row
theModule = 7
print(theModule)

# value written into the 'origine' column of every output row
theOrigin = "FSI-Chinese"
print(theOrigin)

# the output filename, built from the input file's directory and the unit name
theOutputFileName = os.path.join(theFileNamePrefix, 'FSI-' + theUnit + '-frames.simp.csv')
print('theOutputFileName : ', theOutputFileName)

# directory the script was started from (only reported here)
baseDirectory = os.getcwd()
print(' Base directory : ', baseDirectory)

# the comparison file: original pinyin next to the generated pinyin
thePinyinFile = os.path.join(theFileNamePrefix, 'pinyin.txt')

# initialise the hanzi tokenizer
analyzer = ChineseAnalyzer()
# initialise the hanzi -> pinyin converter
hanConvert = Pinyin()
# initialise opencc with the 't2s' configuration (traditional to simplified)
cc = OpenCC('t2s')

# ---------------------------------------------------------------------------
# Main conversion: read the input TSV, write the completed TSV and the
# pinyin comparison file.
# ---------------------------------------------------------------------------

# Columns of the output TSV, in the order they are written.
fieldNames = ['numero', 'lettre', 'pinyinOriginal', 'hanziCorrige', 'anglais', 'module', 'lesson', 'origine']
# Columns of the input TSV. The names are supplied explicitly to DictReader,
# so the first line of the file is read as data, not as a header.
inputFieldsNames = ['numero', 'lettre', 'pinyinOriginal', 'hanziOriginal', 'anglais']

# ASCII punctuation -> full-width Chinese punctuation (applied to the hanzi).
asciiToHanziPunctuation = str.maketrans({'.': '。', '?': '？', ',': '，', '!': '！'})
# Full-width punctuation -> ASCII (applied to the generated pinyin); the
# comma is followed by a space.
hanziToAsciiPunctuation = str.maketrans({'。': '.', '，': ', ', '？': '?', '！': '!'})

# All three files hold Chinese text or tone-marked pinyin, so the encoding is
# pinned to UTF-8 instead of depending on the platform locale. The csv module
# requires its files to be opened with newline='' so that it controls line
# endings itself. The context manager closes every file, so no explicit
# close() calls are needed afterwards.
with open(theOutputFileName, 'w', encoding='utf-8', newline='') as o, \
        open(thePinyinFile, 'w', encoding='utf-8') as t, \
        open(theFileName, 'r', encoding='utf-8', newline='') as f:
    print('pinyin comparaison text file opened')
    print('csv file opened')

    csvwriter = csv.DictWriter(o, fieldnames=fieldNames, delimiter='\t')
    data = csv.DictReader(f, delimiter='\t', fieldnames=inputFieldsNames)

    for theLineNumber, row in enumerate(data, start=1):
        print('theLineNumber : ', theLineNumber)

        # convert traditional characters to simplified ones
        hanziOriginal = cc.convert(row['hanziOriginal'])
        # fix the punctuation of the hanzi
        theHanziCorrige = hanziOriginal.translate(asciiToHanziPunctuation)

        # split the sentence into words, joined by single spaces
        tabHanzi = analyzer.parse(theHanziCorrige).tokens()
        hanzitokenized = ' '.join(str(x) for x in tabHanzi)

        # generate the pinyin from the tokenized, corrected hanzi
        thePinyinGenere = (
            hanConvert.get_pinyin(hanzitokenized, '', tone_marks='marks')
            .translate(hanziToAsciiPunctuation)
            .capitalize()
        )

        # A fresh dict per row, so no value can leak from a previous row.
        finalRow = {
            'numero': row['numero'],
            'lettre': row['lettre'],
            'pinyinOriginal': row['pinyinOriginal'],
            'hanziCorrige': theHanziCorrige,
            'anglais': row['anglais'],
            'module': theModule,
            'lesson': theUnit,
            'origine': theOrigin,
        }
        csvwriter.writerow(finalRow)

        # write the pinyin comparison file: original line, generated line,
        # then a blank separator line
        t.write(finalRow['pinyinOriginal'] + '\n')
        t.write(thePinyinGenere + '\n\n')