123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899 |
- # Program to complete the grammar CSV (tab-separated) file with audio and translations
- # Licence: MIT
- # Copyright: Eric Streit <eric@yojik.eu> 2022
- # Adds pinyin to the file
- # Adds the end-of-line fields (e.g. 1-1-FSI-Chinese)
- # Dumps all Hanzi fields into a hanzi.txt file
- # Parameters:
- # * name of the file to process
- # * name of the unit (added at the end of each line)
- # The 5 original fields: number, English1, Hanzi1, English2, Hanzi2
- # the CSV library
- import csv
- from typing import TYPE_CHECKING, Generator
- # the library to split the Chinese sentences into words
- import jieba
- # the pinyin library
- from pypinyin import pinyin, lazy_pinyin, Style
- # the hanzipi library for decomposing, finding definitions and examples
- # os module
- import os
- # shutils (moving files)
- import shutil
- # random
- import random,copy, re, sys
- # google TTS
- from GoogleTTS import GoogleTTS
- # delay
- import time
- #
# ---------------------------------------------------------------------------
# Script body: read a tab-separated file with the columns
# number / English1 / Hanzi1 / English2 / Hanzi2, add a pinyin column for each
# Hanzi column, append the constant module / unit / course columns, and write
# the result next to the input file. Every Hanzi field is also dumped, one per
# line, into hanzi.txt in the same directory.
#
# Command line:
#   argv[1]  path of the input file
#   argv[2]  unit number (used in the output file name and the 'unit' column)
# ---------------------------------------------------------------------------
if len(sys.argv) < 3:
    # Fail with a readable message instead of a bare IndexError.
    sys.exit('usage: {} <input-file> <unit>'.format(os.path.basename(sys.argv[0])))

theFileName = str(sys.argv[1])  # the file name to work with
print('theFileName : ', theFileName)
theFileNamePrefix = os.path.dirname(theFileName)
print('theFileNamePrefix : ', theFileNamePrefix)
theHanziFile = os.path.join(theFileNamePrefix, 'hanzi.txt')
theUnit = str(sys.argv[2])
print('the Unit : ', theUnit)
# the output filename according to the input filename
theOutputFileName = os.path.join(theFileNamePrefix, 'FSI-' + theUnit + '-frames.csv')
print('theOutputFileName : ', theOutputFileName)
endOfLine = '\t9\t' + theUnit + '\tFSI-Chinese'
print('the endOfLine : ', endOfLine)

baseDirectory = os.getcwd()
print(' Base directory : ', baseDirectory)

# Convert the unit once, before any output file is created or truncated, so a
# non-numeric unit aborts without destroying previous results.
theUnitNumber = int(theUnit)


def _pinyin_sentence(hanzi):
    """Return the pinyin of *hanzi* as one space-separated string.

    pypinyin's pinyin() yields one list of readings per syllable/segment; the
    first reading of each is used. Joining the strings directly (rather than
    stringifying the lists and stripping quotes and brackets) keeps any
    apostrophes or brackets that occur in the text itself.
    """
    return ' '.join(readings[0] for readings in pinyin(hanzi) if readings)


# The input is opened first so that a missing input file does not leave the
# two output files truncated. All files are read/written as UTF-8 regardless
# of the platform default; newline='' is what the csv module requires for the
# files it reads and writes.
with open(theFileName, 'r', encoding='utf-8', newline='') as f, \
        open(theOutputFileName, 'w', encoding='utf-8', newline='') as o, \
        open(theHanziFile, 'w', encoding='utf-8') as t:
    print('text file opened')
    print('csv file opened')

    fieldNames = ['numero', 'Pinyin1', 'Hanzi1', 'English1',
                  'Pinyin2', 'Hanzi2', 'English2',
                  'module', 'unit', 'course']
    # No header row is written: only data rows go to the output file.
    csvwriter = csv.DictWriter(o, fieldnames=fieldNames, delimiter='\t')
    data = csv.DictReader(f, delimiter='\t')

    for theLineNumber, row in enumerate(data, start=1):
        print('theLineNumber : ', theLineNumber)

        # Collect the raw Hanzi text, one sentence per line.
        t.write(row['Hanzi1'] + '\n')
        t.write(row['Hanzi2'] + '\n')

        # A fresh dict per row, so nothing can leak from the previous row.
        csvwriter.writerow({
            'numero': row['number'],
            'Pinyin1': _pinyin_sentence(row['Hanzi1']),
            'Hanzi1': row['Hanzi1'],
            'English1': row['English1'],
            'Pinyin2': _pinyin_sentence(row['Hanzi2']),
            'Hanzi2': row['Hanzi2'],
            'English2': row['English2'],
            'module': 9,
            'unit': theUnitNumber,
            'course': 'FSI-Chinese',
        })
# The with-statement closes all three files; no explicit close() is needed.
|