Entry.php 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296
  1. <?php
  2. namespace CcCedict;
  3. use \Exception;
  /**
   * Class to represent an Entry in the CC-CEDICT dictionary
   *
   * An entry is fed with the parser's match data via setData() and can then
   * produce reports (associative arrays keyed by the F_* constants) through
   * getBasic(), getOptional() and getFull().
   *
   * @package CcCedict
   */
  class Entry
  {
      const F_ORIGINAL = 'original';
      const F_TRADITIONAL = 'traditional';
      const F_SIMPLIFIED = 'simplified';
      const F_PINYIN = 'pinyin';
      const F_PINYIN_NUMERIC = 'pinyinNumeric';
      const F_PINYIN_NUMERIC_EXPANDED = 'pinyinNumericExpanded';
      const F_PINYIN_DIACRITIC = 'pinyinDiacritic';
      const F_PINYIN_DIACRITIC_EXPANDED = 'pinyinDiacriticExpanded';
      const F_ENGLISH = 'english';
      const F_ENGLISH_EXPANDED = 'englishExpanded';
      const F_TRADITIONAL_CHARS = 'traditionalChars';
      const F_SIMPLIFIED_CHARS = 'simplifiedChars';

      /**
       * the original data about one entry
       *
       * Index layout as read by resolveOption(): 0 = original, 1 = traditional,
       * 2 = simplified, 3 = pinyin, 4 = english.
       *
       * @var array
       */
      private $dataOriginal;

      /**
       * the data prepared for output about one entry, keyed by F_* constants
       *
       * Initialised to an empty array so that the report getters always
       * honour their array return type, even before any field was resolved.
       *
       * @var array
       */
      private $dataOutput = [];

      /**
       * sets the data values from the parser's match data
       *
       * Any output prepared for previously set data is discarded, so a reused
       * Entry instance never reports fields that belong to another entry.
       *
       * @param array $match
       */
      public function setData($match)
      {
          $this->dataOriginal = $match;
          $this->dataOutput = [];
      }

      /**
       * gets a basic report of the entry content
       *
       * Adds the expanded English definitions and the traditional/simplified
       * character lists to the report.
       *
       * @return array
       *
       * @throws Exception
       */
      public function getBasic(): array
      {
          return $this->getOptional([
              self::F_ENGLISH_EXPANDED,
              self::F_TRADITIONAL_CHARS,
              self::F_SIMPLIFIED_CHARS,
          ]);
      }

      /**
       * gets a report of the entry content, featuring specified fields
       *
       * The requested fields are added to whatever has already been resolved
       * for the current data since the last setData() call.
       *
       * @param array $options Fields we want to see in the report, referenced by
       *                       class constants, e.g. Entry::F_ORIGINAL
       *
       * @return array
       *
       * @throws Exception When an unknown option is requested
       */
      public function getOptional(array $options): array
      {
          foreach ($options as $option) {
              $this->dataOutput[$option] = $this->resolveOption($option);
          }

          return $this->dataOutput;
      }

      /**
       * gets a full report of the entry content
       *
       * This is the basic report plus numeric and diacritic pinyin.
       *
       * @return array
       *
       * @throws Exception
       */
      public function getFull(): array
      {
          $this->getBasic();

          return $this->getOptional([
              self::F_PINYIN_NUMERIC,
              self::F_PINYIN_DIACRITIC,
          ]);
      }

      /**
       * gets data for given option
       *
       * @param string $option The option we want
       *
       * @return mixed The data for the named option
       *
       * @throws Exception When the option is not one of the F_* constants
       */
      private function resolveOption($option)
      {
          switch ($option) {
              // fields taken verbatim from the parser's match data
              case self::F_ORIGINAL:
                  return $this->dataOriginal[0];
              case self::F_TRADITIONAL:
                  return $this->dataOriginal[1];
              case self::F_SIMPLIFIED:
                  return $this->dataOriginal[2];
              case self::F_PINYIN:
                  return $this->dataOriginal[3];
              case self::F_ENGLISH:
                  return $this->dataOriginal[4];

              // fields derived from the ones above
              case self::F_TRADITIONAL_CHARS:
                  return $this->extractChineseChars($this->resolveOption(self::F_TRADITIONAL));
              case self::F_SIMPLIFIED_CHARS:
                  return $this->extractChineseChars($this->resolveOption(self::F_SIMPLIFIED));
              case self::F_PINYIN_NUMERIC:
                  return $this->convertToPinyinNumeric($this->resolveOption(self::F_PINYIN));
              case self::F_PINYIN_NUMERIC_EXPANDED:
                  return explode(' ', $this->resolveOption(self::F_PINYIN_NUMERIC));
              case self::F_PINYIN_DIACRITIC:
                  return $this->convertToPinyinDiacritic($this->resolveOption(self::F_PINYIN));
              case self::F_PINYIN_DIACRITIC_EXPANDED:
                  return explode(' ', $this->resolveOption(self::F_PINYIN_DIACRITIC));
              case self::F_ENGLISH_EXPANDED:
                  return explode('/', $this->dataOriginal[4]);

              default:
                  throw new Exception('Unknown option: ' . $option);
          }
      }

      /**
       * extracts the Chinese characters
       *
       * @param string $chinese String with Chinese characters in it
       *
       * @return array One element per Chinese character, in order of appearance
       */
      private function extractChineseChars($chinese): array
      {
          // \p{Han} catches all Chinese characters, also those that are outside
          // the everyday spectrum (such as Suzhou numerals or rare variants).
          // This makes sense for the dictionary, \p{Lo} didn't quite cut it.
          preg_match_all('/\p{Han}/u', $chinese, $matches);

          // fall back to an empty list should the match fail
          return $matches[0] ?? [];
      }

      /**
       * Converts the CC-CEDICT pinyin to more familiar numeric pinyin
       *
       * CC-CEDICT Pinyin formatting info: https://cc-cedict.org/wiki/format:syntax
       *
       * This deals with idiosyncrasies in CC-CEDICT pinyin where:
       * lu:4 => lü4
       * xian4 r5 => xianr4
       *
       * @param string $pinyin
       *
       * @return string
       */
      private function convertToPinyinNumeric($pinyin): string
      {
          // "u:" is the CC-CEDICT spelling of ü
          $pinyin = str_replace(['u:', 'U:'], ['ü', 'Ü'], $pinyin);

          // merge a separate erhua "r5" into the preceding syllable, keeping
          // that syllable's tone number at the end
          return preg_replace('/(\d) r5/', 'r$1', $pinyin);
      }

      /**
       * Converts the CC-CEDICT pinyin to accented/diacritic-marked pinyin
       *
       * Pinyin diacritic placement rules: http://pinyin.info/rules/where.html
       * CC-CEDICT pinyin formatting info: https://cc-cedict.org/wiki/format:syntax
       *
       * Example: "nu:3 er2 lu:5 V gou3" => "nǚ ér lü V gǒu"
       *
       * @param string $pinyin Space separated CC-CEDICT pinyin
       *
       * @return string
       */
      private function convertToPinyinDiacritic($pinyin): string
      {
          // allowed vowels in pinyin.
          $vowels = ['a', 'e', 'i', 'o', 'u', 'u:', 'A', 'E', 'I', 'O', 'U', 'U:'];
          // mapping: tone-vowel to diacritic'd-vowel; keys are tones, values are vowel-mappings
          $conversion = [
              1 => array_combine($vowels, ['ā', 'ē', 'ī', 'ō', 'ū', 'ǖ', 'Ā', 'Ē', 'Ī', 'Ō', 'Ū', 'Ǖ']),
              2 => array_combine($vowels, ['á', 'é', 'í', 'ó', 'ú', 'ǘ', 'Á', 'É', 'Í', 'Ó', 'Ú', 'Ǘ']),
              3 => array_combine($vowels, ['ǎ', 'ě', 'ǐ', 'ǒ', 'ǔ', 'ǚ', 'Ǎ', 'Ě', 'Ǐ', 'Ǒ', 'Ǔ', 'Ǚ']),
              4 => array_combine($vowels, ['à', 'è', 'ì', 'ò', 'ù', 'ǜ', 'À', 'È', 'Ì', 'Ò', 'Ù', 'Ǜ']),
          ];

          $converted = [];
          // the elements are pinyin syllables, but also punctuation marks or single letters
          foreach (explode(' ', $pinyin) as $token) {
              // tone number is the last character; anything non-numeric casts to 0
              $tone = (int)substr($token, -1);
              $syllable = ($tone >= 1 && $tone <= 5) ? substr($token, 0, -1) : '';

              // no valid tone marker, or nothing pinyin-like in front of it (e.g. a
              // bare digit): keep the element exactly as it is
              if (!preg_match('/[A-Za-z]/', $syllable)) {
                  $converted[] = $token;
                  continue;
              }

              // the neutral tone (5) carries no tone mark
              if ($tone < 5) {
                  $syllable = $this->placeToneMark($syllable, $conversion[$tone]);
              }

              // any "u:" that did not receive the tone mark still has to become ü
              $converted[] = str_replace(['u:', 'U:'], ['ü', 'Ü'], $syllable);
          }

          return implode(' ', $converted);
      }

      /**
       * Puts the tone mark on the correct vowel of a single toneless syllable
       *
       * @param string $syllable Pinyin syllable with the tone number already stripped
       * @param array  $marks    Mapping of plain vowels to marked vowels for one tone
       *
       * @return string The syllable with one vowel replaced, or unchanged if it has no vowel
       */
      private function placeToneMark($syllable, array $marks): string
      {
          $position = $this->findToneMarkPosition($syllable, $marks);
          if ($position < 0) {
              return $syllable;
          }

          // "u:" spans two characters; every other vowel is a single character
          $plain = substr($syllable, $position, 2);
          if (!isset($marks[$plain])) {
              $plain = $syllable[$position];
          }

          // replace by position so that only the chosen vowel is touched
          return substr_replace($syllable, $marks[$plain], $position, strlen($plain));
      }

      /**
       * Finds the offset of the vowel that has to carry the tone mark
       *
       * @param string $syllable Pinyin syllable with the tone number already stripped
       * @param array  $marks    Mapping whose keys are the allowed vowels
       *
       * @return int Offset of the vowel, or -1 if the syllable contains no vowel
       */
      private function findToneMarkPosition($syllable, array $marks): int
      {
          // a, e or the o in ou always get the marker; compare against false
          // explicitly because a match at offset 0 is a perfectly valid result
          foreach (['a', 'e', 'ou'] as $needle) {
              $position = stripos($syllable, $needle);
              if ($position !== false) {
                  return $position;
              }
          }

          // if no a, e, or ou found, the tone mark goes on the last vowel
          for ($i = strlen($syllable) - 1; $i >= 0; $i--) {
              if (isset($marks[$syllable[$i]])) {
                  return $i;
              }
          }

          return -1;
      }
  }