# Program to complete the grammar CSV (tab-separated) file with audio and translations
# Licence: MIT
# Copyright: Eric Streit <eric@yojik.eu> 2022
# adds pinyin to the file
# adds an end-of-line suffix (1-1-FSI-Chinese)
# dumps all Hanzi parts into a hanzi.txt file
# Processing of Module 9 of FSI Chinese:
# * fix the punctuation of columns 2 and 5 (!;,.?) and the spaces around punctuation marks
# * split the hanzi sentences into words (add a hanzitokenized field)
# * convert those words to pinyin (add pinyinWords)
# * rename pinyin to pinyinSyllabe
# * capitalize the first character of the pinyin field
# * remove punctuation from pinyinSyllabe
# * generate audio
# parameters:
# * name of the file to process
# * name of the unit (appended at the end of each line)
# the 5 original fields: number, English1, hanzi1, English2, hanzi2
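#
# Example invocation (a sketch only; "complete_grammar.py" and the paths below are
# placeholder names, not part of the original project layout):
#   python complete_grammar.py ./module-09/unit-01.csv 01
# sys.argv[1] is the tab-separated input file, sys.argv[2] the unit name added to each row.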
# the CSV library
import csv
from typing import TYPE_CHECKING, Generator
# the library to split the Chinese sentences into words
import jieba
# the pinyin library
from pypinyin import pinyin, lazy_pinyin, Style
from xpinyin import Pinyin
# the hanzipy library for decomposing, finding definitions and examples
# hanzi tokenizer
from chinese import ChineseAnalyzer
# os module
import os
# shutil (moving files)
import shutil
# random
import random, copy, re, sys
# google TTS
from GoogleTTS import GoogleTTS
# delay
import time
#
theFileName = str(sys.argv[1])  # the file name to work with
print('theFileName : ', theFileName)
theFileNamePrefix = os.path.dirname(theFileName)
print('theFileNamePrefix : ', theFileNamePrefix)
theHanziFile = os.path.join(theFileNamePrefix, 'hanzi.txt')
theUnit = str(sys.argv[2])
print('the Unit : ', theUnit)
# the output filename, derived from the input filename
theOutputFileName = os.path.join(theFileNamePrefix, 'FSI-' + theUnit + '-frames.csv')
print('theOutputFileName : ', theOutputFileName)
endOfLine = '\t9\t' + theUnit + '\tFSI-Chinese'
print('the endOfLine : ', endOfLine)
#
baseDirectory = os.getcwd()
print(' Base directory : ', baseDirectory)
# initialise the Hanzi tokenizer
analyzer = ChineseAnalyzer()
# initialise the Hanzi-to-Pinyin converter
hanConvert = Pinyin()
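# A quick sketch of what the two converters produce (illustrative values only,
# assuming the libraries' default dictionaries):
#   pinyin('你好')                                       -> [['nǐ'], ['hǎo']]
#   hanConvert.get_pinyin('你好', '', tone_marks='marks') -> 'nǐhǎo'
# pypinyin returns one list per character; xpinyin returns a joined string.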
with open(theOutputFileName, 'w') as o:
    fieldNames = ['numero', 'pinyin1', 'pinyinSyllabe1', 'hanzi1', 'hanzitokenized1', 'English1',
                  'pinyin2', 'pinyinSyllabe2', 'hanzi2', 'hanzitokenized2', 'English2',
                  'module', 'unit', 'course']
    # no header row is written, only the data rows
    csvwriter = csv.DictWriter(o, fieldnames=fieldNames, delimiter='\t')
    with open(theHanziFile, 'w') as t:
        print('text file opened')
        with open(theFileName, 'r') as f:
            print('csv file opened')
            data = csv.DictReader(f, delimiter='\t')

            # see the names of the fields above
            finalRow = {}
            theLineNumber = 0
            for row in data:
                theLineNumber = theLineNumber + 1
                print('theLineNumber : ', theLineNumber)
                # handle the frame number
                finalRow['numero'] = row['number']

                # handle the plain Hanzi
                # print(row['hanzi1'])
                # save the Hanzi into the text file
                t.write(row['hanzi1'] + '\n')
                t.write(row['hanzi2'] + '\n')
                # build the CSV row
                finalRow['hanzi1'] = row['hanzi1']
                finalRow['hanzi2'] = row['hanzi2']

                # handle the tokenized Hanzi
                # hanzi1: split into words, join with spaces, then drop the spaces around Chinese punctuation
                result = analyzer.parse(row['hanzi1'])
                tabHanzi = result.tokens()
                finalRow['hanzitokenized1'] = ' '.join(str(x) for x in tabHanzi)
                finalRow['hanzitokenized1'] = finalRow['hanzitokenized1'].replace(' 。', '。').replace(' ， ', '，').replace(' ？', '？').replace(' ！', '！')
                # hanzi2
                result = analyzer.parse(row['hanzi2'])
                tabHanzi = result.tokens()
                finalRow['hanzitokenized2'] = ' '.join(str(x) for x in tabHanzi)
                finalRow['hanzitokenized2'] = finalRow['hanzitokenized2'].replace(' 。', '。').replace(' ， ', '，').replace(' ？', '？').replace(' ！', '！')
                # handle the Pinyin
                # pinyin1: whole sentence with tone marks, Chinese punctuation converted to ASCII
                temp = pinyin(row['hanzi1'])
                finalRow['pinyin1'] = hanConvert.get_pinyin(finalRow['hanzitokenized1'], '', tone_marks='marks').replace('。', '.').replace('，', ', ').replace('？', '?').replace('！', '!').capitalize()
                # print('Temp : ', temp)
                # pinyinSyllabe1: one syllable per hanzi, all punctuation removed
                thePinyinSentence = ' '.join(str(x) for x in temp).replace("'", '').replace("]", '').replace("[", '')
                for ch in '.?!,。？！，':
                    thePinyinSentence = thePinyinSentence.replace(ch, '')
                finalRow['pinyinSyllabe1'] = thePinyinSentence

                # pinyin2
                temp = pinyin(row['hanzi2'])
                # print('Temp : ', temp)
                finalRow['pinyin2'] = hanConvert.get_pinyin(finalRow['hanzitokenized2'], '', tone_marks='marks').replace('。', '.').replace('，', ', ').replace('？', '?').replace('！', '!').capitalize()
                # pinyinSyllabe2
                thePinyinSentence = ' '.join(str(x) for x in temp).replace("'", '').replace("]", '').replace("[", '')
                for ch in '.?!,。？！，':
                    thePinyinSentence = thePinyinSentence.replace(ch, '')
                finalRow['pinyinSyllabe2'] = thePinyinSentence

                # the remaining fields
                finalRow['English1'] = row['English1']
                finalRow['English2'] = row['English2']
                finalRow['module'] = 9
                finalRow['unit'] = int(theUnit)
                finalRow['course'] = 'FSI-Chinese'
                csvwriter.writerow(finalRow)
# the with-statements close the input file, the hanzi text file and the output CSV automatically
|