Parser.php 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180
  1. <?php
  2. namespace CcCedict;
  3. use \Exception;
  4. use \SplFileObject;
  5. /**
  6. * Class for parsing the CC-CEDICT dictionary
  7. *
  8. * @package CcCedict
  9. */
  class Parser
  {
      /**
       * Path/filename of the raw, uncompressed CC-CEDICT data file.
       *
       * @var string
       */
      private $filePath;

      /**
       * Options used to configure the report requested from each Entry.
       * When empty, the basic report (Entry::getBasic()) is used instead.
       *
       * @var array
       */
      private $options = [];

      /**
       * Number of lines read from the file for each yielded block.
       *
       * @var int
       */
      private $blockSize = 50;

      /**
       * 0-based line number at which reading starts.
       *
       * @var int
       */
      private $startLine = 0;

      /**
       * Maximum number of blocks to read in total; INF means "until end of file".
       *
       * @var float
       */
      private $numberOfBlocks = INF;

      /**
       * Sets the path/filename containing the raw uncompressed CC-CEDICT data.
       *
       * @param string $filePath
       */
      public function setFilePath($filePath)
      {
          $this->filePath = $filePath;
      }

      /**
       * Sets the options with which to configure the report from the Entry object.
       * A non-empty array makes parseLine() call Entry::getOptional() with it.
       *
       * @param array $options
       */
      public function setOptions(array $options)
      {
          $this->options = $options;
      }

      /**
       * Sets how many lines the parser reads per block.
       *
       * @param int $blockSize must be at least 1, otherwise parse() throws
       */
      public function setBlockSize(int $blockSize = 50)
      {
          $this->blockSize = $blockSize;
      }

      /**
       * Sets the line number where the parser will start reading. 0-based.
       *
       * @param int $startLine
       */
      public function setStartLine(int $startLine = 0)
      {
          $this->startLine = $startLine;
      }

      /**
       * Sets the number of blocks that the parser will read in total.
       *
       * @param float $numberOfBlocks INF (the default) reads until end of file
       */
      public function setNumberOfBlocks(float $numberOfBlocks = INF)
      {
          $this->numberOfBlocks = $numberOfBlocks;
      }

      /**
       * Reads the file block by block and yields one result array per block.
       *
       * Blank lines and lines starting with '#' (the comment / meta-data header
       * of the file) are ignored: they are neither parsed nor counted as skipped.
       * Every other line is handed to parseLine(); lines it rejects are collected
       * verbatim (trimmed) in 'skippedLines'.
       *
       * Each yielded array has the keys:
       *  - 'parsedLines'  => list of values returned by parseLine()
       *  - 'skippedLines' => list of trimmed lines that did not match the format
       *  - 'numSkipped'   => count of 'skippedLines'
       *  - 'numParsed'    => count of 'parsedLines'
       *
       * Because this method is a generator, nothing is opened or validated until
       * the first iteration; exceptions are thrown at that point, not at call time.
       *
       * @return \Generator
       *
       * @throws Exception if the block size is smaller than 1, or (as a
       *                   RuntimeException) if the file cannot be opened
       */
      public function parse()
      {
          $blockSize = $this->blockSize;
          $startLine = $this->startLine;
          $blocks = $this->numberOfBlocks;

          // With a block size below 1 the inner loop never advances, EOF is never
          // reached and the generator would yield empty blocks forever.
          if ($blockSize < 1) {
              throw new Exception('Block size must be at least 1, got: ' . $blockSize);
          }

          $filePath = (string) $this->filePath;
          if ($filePath === '') {
              throw new \RuntimeException('Could not open file for parsing: no file path set');
          }

          // SplFileObject reports an unreadable file by throwing, never by
          // returning a falsy value, so the failure has to be caught here.
          try {
              $file = new SplFileObject($filePath);
          } catch (\RuntimeException $e) {
              throw new \RuntimeException(
                  'Could not open file for parsing: ' . $filePath,
                  0,
                  $e
              );
          }

          $blocksRead = 0;
          while (!$file->eof() && $blocksRead < $blocks) {
              $parsedLines = [];
              $skippedLines = [];

              // Move the pointer to the first line of the next block.
              // NOTE(review): seek() is believed to scan from the start of the
              // file on every call; revisit if profiling shows this dominates.
              $file->seek($startLine + ($blocksRead * $blockSize));

              // The last block is usually shorter than $blockSize, so EOF has to
              // be re-checked for every line, not only once per block.
              for ($i = 0; !$file->eof() && $i < $blockSize; $i++) {
                  $line = trim($file->current());

                  // Only non-empty lines that are not '#' comments are candidates.
                  if ($line !== '' && strpos($line, '#') !== 0) {
                      $parsedLine = $this->parseLine($line);
                      if ($parsedLine) {
                          $parsedLines[] = $parsedLine;
                      } else {
                          $skippedLines[] = $line;
                      }
                  }

                  $file->next();
              }

              $blocksRead++;

              yield [
                  'parsedLines' => $parsedLines,
                  'skippedLines' => $skippedLines,
                  'numSkipped' => count($skippedLines),
                  'numParsed' => count($parsedLines),
              ];
          }
      }

      /**
       * Parses a single line from the file, checking that it meets the basic
       * dictionary layout:
       *
       *   Traditional Simplified [pin1 yin1] /English equivalent 1/equivalent 2/
       *   中國 中国 [Zhong1 guo2] /China/Middle Kingdom/
       *
       * On a match the raw preg_match() result is passed to Entry::setData() and
       * the entry's report is returned: Entry::getOptional() when options were
       * set, Entry::getBasic() otherwise.
       *
       * @param string $line A line from the CC-CEDICT file
       *
       * @return false|array false when the line does not match the layout
       */
      private function parseLine($line)
      {
          $line = trim($line);

          if (!preg_match('#(.+) (.+) \[(.+)\] /(.*)/#', $line, $match)) {
              return false;
          }

          $entry = new Entry();
          $entry->setData($match);

          if (count($this->options)) {
              return $entry->getOptional($this->options);
          }

          return $entry->getBasic();
      }
  }