# Program to complete the grammar CSV (tab-separated) file with audio and translations
# Licence: MIT
# Copyright: Eric Streit <eric@yojik.eu> 2022
# adds pinyin to the file
# adds an end-of-line suffix (1-1-FSI-Chinese)
# dumps all Hanzi parts into a hanzi.txt file
# Processing of Module 9 of FSI Chinese:
# * fix the punctuation of columns 2 and 5 (!;,.?) and the spaces around punctuation marks
# * split the hanzi sentences into words (add a hanzitokenized field)
# * convert those words to pinyin (add pinyinWords)
# * rename pinyin to pinyinSyllabe
# * capitalize the first character of the pinyin field
# * remove punctuation from pinyinSyllabe
# * generate audio
# parameters:
# * name of the file to process
# * name of the unit (appended at the end of each line)
# the 5 original fields: number, English1, hanzi1, English2, hanzi2
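#
# Example invocation (a sketch only; "complete_grammar.py" and the paths below are
# placeholder names, not part of the original project layout):
#   python complete_grammar.py ./module-09/unit-01.csv 01
# sys.argv[1] is the tab-separated input file, sys.argv[2] the unit name added to each row.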
# the CSV library
import csv
from typing import TYPE_CHECKING, Generator
# the library to split the Chinese sentences into words
import jieba
# the pinyin library
from pypinyin import pinyin, lazy_pinyin, Style
from xpinyin import Pinyin
# the hanzipy library for decomposing, finding definitions and examples
# hanzi tokenizer
from chinese import ChineseAnalyzer
# os module
import os
# shutil (moving files)
import shutil
# random
import random, copy, re, sys
# google TTS
from GoogleTTS import GoogleTTS
# delay
import time
#
theFileName = str(sys.argv[1])  # the file name to work with
print('theFileName : ', theFileName)
theFileNamePrefix = os.path.dirname(theFileName)
print('theFileNamePrefix : ', theFileNamePrefix)
theHanziFile = os.path.join(theFileNamePrefix, 'hanzi.txt')
theUnit = str(sys.argv[2])
print('the Unit : ', theUnit)
# the output filename, derived from the input filename
theOutputFileName = os.path.join(theFileNamePrefix, 'FSI-' + theUnit + '-frames.csv')
print('theOutputFileName : ', theOutputFileName)
endOfLine = '\t9\t' + theUnit + '\tFSI-Chinese'
print('the endOfLine : ', endOfLine)
#
baseDirectory = os.getcwd()
print(' Base directory : ', baseDirectory)
# initialise the Hanzi tokenizer
analyzer = ChineseAnalyzer()
# initialise the Hanzi-to-Pinyin converter
hanConvert = Pinyin()
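# A quick sketch of what the two converters produce (illustrative values only,
# assuming the libraries' default dictionaries):
#   pinyin('你好')                                       -> [['nǐ'], ['hǎo']]
#   hanConvert.get_pinyin('你好', '', tone_marks='marks') -> 'nǐhǎo'
# pypinyin returns one list per character; xpinyin returns a joined string.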
with open(theOutputFileName, 'w') as o:
    fieldNames = ['numero', 'pinyin1', 'pinyinSyllabe1', 'hanzi1', 'hanzitokenized1', 'English1',
                  'pinyin2', 'pinyinSyllabe2', 'hanzi2', 'hanzitokenized2', 'English2',
                  'module', 'unit', 'course']
    # no header row is written, only the data rows
    csvwriter = csv.DictWriter(o, fieldnames=fieldNames, delimiter='\t')
    with open(theHanziFile, 'w') as t:
        print('text file opened')
        with open(theFileName, 'r') as f:
            print('csv file opened')
            data = csv.DictReader(f, delimiter='\t')

            # see the names of the fields above
            finalRow = {}
            theLineNumber = 0
            for row in data:
                theLineNumber = theLineNumber + 1
                print('theLineNumber : ', theLineNumber)
                # handle the frame number
                finalRow['numero'] = row['number']

                # handle the plain Hanzi
                # print(row['hanzi1'])
                # save the Hanzi into the text file
                t.write(row['hanzi1'] + '\n')
                t.write(row['hanzi2'] + '\n')
                # build the CSV row
                finalRow['hanzi1'] = row['hanzi1']
                finalRow['hanzi2'] = row['hanzi2']

                # handle the tokenized Hanzi
                # hanzi1: split into words, join with spaces, then drop the spaces around Chinese punctuation
                result = analyzer.parse(row['hanzi1'])
                tabHanzi = result.tokens()
                finalRow['hanzitokenized1'] = ' '.join(str(x) for x in tabHanzi)
                finalRow['hanzitokenized1'] = finalRow['hanzitokenized1'].replace(' 。', '。').replace(' ， ', '，').replace(' ？', '？').replace(' ！', '！')
                # hanzi2
                result = analyzer.parse(row['hanzi2'])
                tabHanzi = result.tokens()
                finalRow['hanzitokenized2'] = ' '.join(str(x) for x in tabHanzi)
                finalRow['hanzitokenized2'] = finalRow['hanzitokenized2'].replace(' 。', '。').replace(' ， ', '，').replace(' ？', '？').replace(' ！', '！')
                # handle the Pinyin
                # pinyin1: whole sentence with tone marks, Chinese punctuation converted to ASCII
                temp = pinyin(row['hanzi1'])
                finalRow['pinyin1'] = hanConvert.get_pinyin(finalRow['hanzitokenized1'], '', tone_marks='marks').replace('。', '.').replace('，', ', ').replace('？', '?').replace('！', '!').capitalize()
                # print('Temp : ', temp)
                # pinyinSyllabe1: one syllable per hanzi, all punctuation removed
                thePinyinSentence = ' '.join(str(x) for x in temp).replace("'", '').replace("]", '').replace("[", '')
                for ch in '.?!,。？！，':
                    thePinyinSentence = thePinyinSentence.replace(ch, '')
                finalRow['pinyinSyllabe1'] = thePinyinSentence

                # pinyin2
                temp = pinyin(row['hanzi2'])
                # print('Temp : ', temp)
                finalRow['pinyin2'] = hanConvert.get_pinyin(finalRow['hanzitokenized2'], '', tone_marks='marks').replace('。', '.').replace('，', ', ').replace('？', '?').replace('！', '!').capitalize()
                # pinyinSyllabe2
                thePinyinSentence = ' '.join(str(x) for x in temp).replace("'", '').replace("]", '').replace("[", '')
                for ch in '.?!,。？！，':
                    thePinyinSentence = thePinyinSentence.replace(ch, '')
                finalRow['pinyinSyllabe2'] = thePinyinSentence

                # the remaining fields
                finalRow['English1'] = row['English1']
                finalRow['English2'] = row['English2']
                finalRow['module'] = 9
                finalRow['unit'] = int(theUnit)
                finalRow['course'] = 'FSI-Chinese'
                csvwriter.writerow(finalRow)
# the with-statements close the input file, the hanzi text file and the output CSV automatically
|