# mod9.py
# Program to complete the grammar CSV (tab-separated) file with audio and translations
# Licence: MIT
# Copyright: Eric Streit <eric@yojik.eu> 2022
# adding pinyin to the file
# adding end of line (1-1-FSI-Chinese)
# dumping all Hanzi parts into a hanzi.txt file
# Processing of Module 9 of FSI Chinese:
# * fix the punctuation of columns 2 and 5 (!;,.?) and the spaces around punctuation marks
# * split the hanzi sentences into words (add the hanzitokenized field)
# * convert these words to pinyin (add pinyinWords)
# * rename pinyin to pinyinSyllabe
# * capitalise the first character of pinyin in pinyin
# * remove punctuation from pinyinSyllabe
# * generate audio
# parameters:
# * name of the file to deal with
# * name of the unit (to be added at the end of each line)
# 5 original fields: number, English1, hanzi1, English2, hanzi2
  19. # the CSV library
  20. import csv
  21. from typing import TYPE_CHECKING, Generator
  22. # the library to split the Chinese sentences into words
  23. import jieba
  24. # the pinyin library
  25. from pypinyin import pinyin, lazy_pinyin, Style
  26. from xpinyin import Pinyin
  27. # the hanzipi library for decomposing, finding definitions and examples
  28. # hanzi tokenizer
  29. from chinese import ChineseAnalyzer
  30. # os module
  31. import os
  32. # shutils (moving files)
  33. import shutil
  34. # random
  35. import random,copy, re, sys
  36. # google TTS
  37. from GoogleTTS import GoogleTTS
  38. # delay
  39. import time
  40. #
  41. theFileName = '' # the file name to work with
  42. theFileName = str(sys.argv[1])
  43. print ('theFileName : ',theFileName)
  44. theFileNamePrefix = os.path.dirname(theFileName)
  45. print ('theFileNamePrefix : ',theFileNamePrefix)
  46. theHanziFile = os.path.join(theFileNamePrefix, 'hanzi.txt')
  47. theUnit = str(sys.argv[2])
  48. print ('the Unit : ', theUnit)
  49. theOutputFileName = os.path.join(theFileNamePrefix, 'FSI-' + theUnit + '-frames.csv') # the output filename according to the input filename
  50. print ('theOutputFileName : ',theOutputFileName)
  51. endOfLine = '\t9\t' + theUnit + '\tFSI-Chinese'
  52. print ('the endOfLine : ', endOfLine)
  53. #
  54. baseDirectory = os.getcwd()
  55. print (' Base directory : ', baseDirectory)
  56. # initialisation du tokenizer Hanzi
  57. analyzer = ChineseAnalyzer()
  58. # initialisation du convertisseur Hanzi Pinyin
  59. hanConvert = Pinyin()
  60. with open(theOutputFileName,'w') as o:
  61. fieldNames = ['numero','pinyin1','pinyinSyllabe1','hanzi1','hanzitokenized1','English1','pinyin2','pinyinSyllabe2','hanzi2','hanzitokenized2','English2','module','unit','course']
  62. csvwriter = csv.DictWriter(o, fieldnames=fieldNames,delimiter='\t')
  63. with open(theHanziFile,'w') as t:
  64. print('text file opened')
  65. with open(theFileName,'r') as f:
  66. print('csv file opened')
  67. data = csv.DictReader(f, delimiter='\t')
  68. # see the names of the fields above
  69. finalRow = {}
  70. theLineNumber = 0
  71. for row in data:
  72. theLineNumber = theLineNumber + 1
  73. print ('theLineNumber : ',theLineNumber)
  74. # Traitement du numéro
  75. finalRow['numero'] = row['number']
  76. # Traitement du Hanzi normal
  77. # print (row['hanzi1'])
  78. # Sauvegarde du Hanzi dans un fichier texte
  79. t.write(row['hanzi1'] + '\n')
  80. t.write(row['hanzi2'] + '\n')
  81. # construction du CSV
  82. finalRow['hanzi1'] = row['hanzi1']
  83. finalRow['hanzi2'] = row['hanzi2']
  84. # Traitement du Hanzi tokenized
  85. # hanzi1
  86. result = analyzer.parse(row['hanzi1'])
  87. tabHanzi = result.tokens()
  88. finalRow['hanzitokenized1'] = ' '.join(str(x) for x in tabHanzi)
  89. finalRow['hanzitokenized1'] = finalRow['hanzitokenized1'].replace(' 。','。').replace(' , ',',').replace(' ?','?').replace(' !','!')
  90. # hanzi2
  91. result = analyzer.parse(row['hanzi2'])
  92. tabHanzi = result.tokens()
  93. finalRow['hanzitokenized2'] = ' '.join(str(x) for x in tabHanzi)
  94. finalRow['hanzitokenized2'] = finalRow['hanzitokenized2'].replace(' 。','。').replace(' , ',',').replace(' ?','?').replace(' !','!')
  95. # Traitement du Pinyin
  96. # pinyin1
  97. temp = pinyin(row['hanzi1'])
  98. finalRow['pinyin1'] = hanConvert.get_pinyin(finalRow['hanzitokenized1'], '', tone_marks='marks').replace('。','.').replace(',',', ').replace('?','?').replace('!','!').capitalize()
  99. # print ('Temp : ', temp)
  100. # pinyinsyllabe
  101. thePinyinSentence = ' '.join(str(x) for x in temp).replace("'",'').replace("]",'').replace("[",'')
  102. thePinyinSentence = thePinyinSentence.replace('.','').replace('.','').replace('?','').replace('!','').replace(',','').replace('。','').replace('?','').replace('!','').replace(',','')
  103. finalRow['pinyinSyllabe1'] = thePinyinSentence
  104. # pinyin2
  105. temp = pinyin(row['hanzi2'])
  106. # print ('Temp : ', temp)
  107. finalRow['pinyin2'] = hanConvert.get_pinyin(finalRow['hanzitokenized2'], '', tone_marks='marks').replace('。','.').replace(',',', ').replace('?','?').replace('!','!').capitalize()
  108. # pinyinsyllabe
  109. thePinyinSentence = ' '.join(str(x) for x in temp).replace("'",'').replace("]",'').replace("[",'')
  110. thePinyinSentence = thePinyinSentence.replace('.','').replace('.','').replace('?','').replace('!','').replace(',','').replace('。','').replace('?','').replace('!','').replace(',','')
  111. finalRow['pinyinSyllabe2'] = thePinyinSentence
  112. # le reste des champs
  113. finalRow['English1'] = row['English1']
  114. finalRow['English2'] = row['English2']
  115. finalRow['module'] = 9
  116. finalRow['unit'] = int(theUnit)
  117. finalRow['course'] = 'FSI-Chinese'
  118. csvwriter.writerow(finalRow)
  119. f.close()
  120. t.close()
  121. o.close()