This is the second step in Making sense of OCR – an orchestration wrapper for the APIs we’ll need. In Google Vision and OCR I showed how to use the Vision API to turn a PDF into a series of pages, blocks, paragraphs, words and symbols, all held together by bounding boxes – in other words, a map of the image below.
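Here's roughly the shape of what that Vision step leaves behind in storage. This is an abbreviated, illustrative sketch showing only the properties the language step reads, not the full response schema:

// abbreviated sketch of one Vision OCR output file - illustrative values only
const exampleVisionOutput = {
  responses: [{
    fullTextAnnotation: {
      pages: [{
        blocks: [{
          paragraphs: [{
            // 4 corners, normalized to the page dimensions
            boundingBox: {
              normalizedVertices: [
                {x: 0.08, y: 0.12}, {x: 0.31, y: 0.12},
                {x: 0.31, y: 0.15}, {x: 0.08, y: 0.15},
              ],
            },
            words: [{
              boundingBox: {
                normalizedVertices: [
                  {x: 0.08, y: 0.12}, {x: 0.15, y: 0.12},
                  {x: 0.15, y: 0.15}, {x: 0.08, y: 0.15},
                ],
              },
              symbols: [
                {text: 'S'},
                {text: 'e'},
                {text: 'a'},
                // the detected break on the last symbol decides where a cell ends
                {text: 'n', property: {detectedBreak: {type: 'SURE_SPACE'}}},
              ],
            }],
          }],
        }],
      }],
    },
  }],
};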
Reading the content
It is executed like this:
node language --path='a.pdf' --country="GB"
where the path is the one specified at OCR time, and the country is used for default localization (for example, country code internationalization for phone numbers).
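checkPhoneNumber itself lives in the shared handlers and isn't reproduced in this post, but to show what the default country is for, here's a minimal sketch of that kind of check using the libphonenumber-js package – both the package choice and the exact result shape are assumptions for illustration:

// illustrative only - the real checkPhoneNumber is in ../common/vihandlers
const {parsePhoneNumberFromString} = require('libphonenumber-js/max');

const checkPhoneNumber = ({text, defaultCountry}) => {
  // the default country lets national numbers like '0123 456 789' be resolved
  const parsed = parsePhoneNumberFromString(text, defaultCountry);
  if (!parsed || !parsed.isValid()) return {valid: false};
  return {
    valid: true,
    type: parsed.getType() === 'MOBILE' ? 'mobile' : parsed.getType(),
    number: {international: parsed.formatInternational()},
  };
};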
index.js
const argv = require('yargs').argv;
const languageServer = require('./languageserver');
languageServer.init({
  mode: process.env.FIDRUNMODE || 'pv',
  argv,
});
languageserver.js
const secrets = require('../private/visecrets');
const languageOrchestrate = require('./languageorchestrate');

// wrapper to run the whole thing
const init = async ({mode, argv}) => {
  languageOrchestrate.init({mode});
  await languageOrchestrate.start({mode, argv});
};

module.exports = {init};
languageorchestrate.js
const languageContent = require('./languagecontent');
const languageText = require('./languagetext');
const {getPaths} = require('../common/vihandlers');

// manages orchestration after vision api
const init = ({mode}) => {
  languageContent.init({mode});
  languageText.init({mode});
};

const start = async ({mode, argv}) => {
  // for convenience - there's a common way to get all the file names/paths
  // the bucket is specified in visecrets and the initial source path here
  let {path, country: defaultCountry} = argv;

  // TODO maybe mandate this arg later
  defaultCountry = defaultCountry || 'GB';

  const paths = getPaths({
    pathName: path,
    mode,
  });
  const {gcsContentUri, gcsDataUri} = paths;

  // now clean up and extract the text from the vision response
  const gcsTextUri = gcsContentUri;

  // get the contents from storage that the vision step created
  const contents = await languageText.getContents({gcsTextUri, mode});

  // organize the ocr
  const cleanCells = await languageText.start({contents, defaultCountry});

  // now entity analysis using natural language api
  const entities = await languageContent.start({
    content: cleanCells.map(cell => cell.cleanText).join(','),
  });

  // now associate the mentions back to the original items
  const assignedEntities = await languageText.assignEntities({
    cleanCells,
    entities,
  });

  // now get more info from google knowledge graph and attach to each entity
  const assignedMids = await languageText.getMids({assignedEntities});

  // but we'll skip the bounding box info now
  // finally write the whole thing to storage
  return Promise.all([
    languageText.writeResult({
      data: {
        mode,
        paths,
        defaultCountry,
        content: assignedMids.map(f => {
          f.texts = f.texts.map(g => {
            delete g.bb;
            return g;
          });
          return f;
        }),
      },
      gcsDataUri,
      mode,
    }),
  ]);
};

module.exports = {
  init,
  start,
};
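getPaths isn't shown here – it's shared with the Vision step – but from its usage above it only needs to turn the original source path into the storage URIs for this run. A minimal sketch, assuming the bucket name comes from visecrets (getBucketName is a hypothetical accessor) and the naming convention is arbitrary:

// illustrative only - the real getPaths is in ../common/vihandlers
const {getBucketName} = require('../private/visecrets'); // hypothetical accessor

const getPaths = ({pathName, mode}) => {
  const bucket = getBucketName({mode});
  const stub = pathName.replace(/\W/g, '_');
  return {
    // where the vision step wrote its ocr output
    gcsContentUri: `gs://${bucket}/ocr/${stub}/`,
    // where this step will write its final result
    gcsDataUri: `gs://${bucket}/language/${stub}.json`,
  };
};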
languagetext.js
The Vision API marks the symbols of each word with a detected break type; these are the ones the code below cares about:

'EOL_SURE_SPACE',
'SURE_SPACE',
'LINE_BREAK',
'SPACE',

A sure space or line break ends a cell, whereas a plain space just separates words within one. Once the cells have been cleaned up, the clue checks and entity analysis can pick out content such as a phone number

+1 234 567 890

a person

Sean Connery

a place

The Bahamas

or a profession

Actor
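For cells like these, the clean step in the code below attaches a clues property recording what the text most likely is. This is a hand-written illustration of what a cleaned phone number cell might end up looking like, not output captured from a real run:

// illustrative only - a hand-written example of a cleaned cell
const exampleCell = {
  cleanText: '+1 234 567 890',
  rowNumber: 3,
  columnNumber: 1,
  clues: {
    mobile: {valid: true, type: 'mobile', embellished: '+1 234 567 890'},
    phone: {valid: true, type: 'mobile', embellished: '+1 234 567 890'},
    person: null,
    profession: null,
    email: null,
    date: null,
    company: null,
  },
};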
const storageStream = require('../common/storagestream');
const kgSearch = require('../common/kgsearch');
const {joinCellTexts, joinSymbols} = require('../common/viutils');
const {clues} = require('../private/visecrets');
const {
  checkPhoneNumber,
  joinPersonName,
  checkProfession,
  checkCompany,
  getFuseProfession,
  getFuseCompany,
  checkEmail,
  checkDate,
} = require('../common/vihandlers');
const buildLookups = require('../common/buildlookups');

let lookups = null;

// this does all the text fiddling and is the most complex area
const getContents = async ({gcsTextUri, mode}) =>
  await storageStream.getFilesContents({
    gcsUri: gcsTextUri,
    mode,
  });

// write the final result to storage
const writeResult = async ({data, gcsDataUri, mode}) =>
  storageStream.streamContent({name: gcsDataUri, content: data, mode});

// get creds for knowledge graph search
// note that an api key is needed
const init = ({mode}) => {
  kgSearch.init({mode});
  buildLookups.init({mode});
  lookups = buildLookups.start({mode});
};

// entry point
const start = async ({contents, defaultCountry}) => {
  // flatten everything - the vision API content is convoluted
  const cells = makeCells({contents});

  // now cluster these into rows
  const rows = clusterRows({cells});

  // now cluster the rows into columns, for tabular presentation if needed
  const columns = clusterColumns({cells});

  // now we have decided how many columns, let's allocate them to actual cells
  allocateColumns({columns, rows});

  // now attach row and column numbers to the final result
  const attachedNumbers = attachNumbers({cells, rows, columns});

  // we need the result of lookups
  const [professionLookups, companyLookups] = await lookups;
  const fuseProfession = getFuseProfession({professionLookups});
  const fuseCompany = getFuseCompany({companyLookups});

  // finally clean up the text for use in the natural language api
  const cleanCells = clean({
    cells: attachedNumbers,
    defaultCountry,
    fuseProfession,
    fuseCompany,
  });
  return cleanCells;
};

// just get rid of white space/punctuation/join numbers etc.
const clean = ({cells, defaultCountry, fuseProfession, fuseCompany}) => {
  return cells.map(cell => {
    const s = joinCellTexts({cell})
      .trim()
      .replace(/\s?['"]\s?/g, '')
      .replace(/\.$/, ' ')
      .replace(/[,_\*]/g, ' ')
      .replace(/\s+/g, ' ')
      .trim();

    // now get rid of anything that's just clearly noise
    cell.cleanText = clues.nonsense.some(f => s.match(f)) ? '' : s;

    // do any noise trimming
    clues.trim.forEach(f => cell.cleanText = cell.cleanText.replace(f, ''));

    // best checks on content
    const phone = checkPhoneNumber({text: cell.cleanText, defaultCountry});
    const person = joinPersonName({text: cell.cleanText});
    const profession = checkProfession({text: cell.cleanText, fuseProfession});
    const company = checkCompany({text: cell.cleanText, fuseCompany});
    const email = checkEmail({text: cell.cleanText});
    const dt = checkDate({text: cell.cleanText});

    cell.clues = {
      mobile:
        phone.valid && phone.type === 'mobile'
          ? {...phone, embellished: phone.number.international}
          : null,
      phone: phone.valid
        ? {...phone, embellished: phone.number.international}
        : null,
      person: person.likely ? {...person, embellished: person.joined} : null,
      profession: profession.valid
        ? {...profession, embellished: profession.searches[0].item.name}
        : null,
      email: email.valid ? {...email, embellished: email.text} : null,
      date: dt.valid ? {...dt, embellished: dt.iso} : null,
      company: company.valid
        ? {...company, embellished: company.searches[0].item.name}
        : null,
    };
    return cell;
  });
};

// use knowledge graph to get more details
const getMids = async ({assignedEntities}) => {
  // do a single fetch on the knowledge graph with all the mids at once
  const mids = assignedEntities.reduce((p, cell) => {
    const mid = cell.entity && cell.entity.metadata && cell.entity.metadata.mid;
    // dedup and record
    if (mid && p.indexOf(mid) === -1) p.push(mid);
    return p;
  }, []);

  // TODO - check - is there a max to the number of mids at once - do we need to chunk this?
  // now we've collected all known mids, attack the knowledge graph
  const kg = await kgSearch.search({mids});
  const midItems = (kg && kg.data.itemListElement) || [];

  // so now we have data from knowledge graph
  return assignedEntities.map(cell => {
    const mid = cell.entity && cell.entity.metadata && cell.entity.metadata.mid;
    if (mid) {
      const kmid = 'kg:' + mid;
      // if this is a known mid, it's possible now to attach the definition
      cell.kg = midItems
        .filter(f => f['@type'] === 'EntitySearchResult')
        .map(f => f.result)
        .find(f => f['@id'] === kmid);
    }
    return cell;
  });
};

// assign entities where they were classified
// the entity analysis is disassociated from the original text - have to find it again in the original
const assignEntities = ({cleanCells, entities}) => {
  entities.forEach(entity => {
    // reassociate with each mention
    cleanCells.forEach(cell => {
      if (entity.name === cell.cleanText) {
        const {metadata, type} = entity;
        cell.entity = {
          metadata,
          type,
        };
      }
    });
  });
  return cleanCells;
};

// sometimes we get a separator that should be used as if it were a sure_space
const PROXY_BREAKS = ['|', ':', '>'];

// this means we can get rid of any proxies from the text
const PROXY_DISPOSE = true;

// this gets rid of any text that's just blanks
const BLANK_DISPOSE = true;

/*
 * although the final result will be thought of as 'row based'
 * that concept doesn't exist in the initial image
 * so instead we need to think in terms of cells that can later be linked
 * by the affinity of their normalized vertices
 * so there's no point in keeping the original complex layout
 * just flatten into a collection of cells
 */
const makeCells = ({contents}) => {
  return contents.reduce((cells, file, fileIndex) => {
    const {content} = file;
    content.responses.forEach((f, index) => {
      // vision can't handle more than 1000 pages so use that as a false separator
      const pageIndex = index + fileIndex * 1000;
      // it's already paginated within each response, so there's only 1 page as far as I can tell
      const {pages} = f.fullTextAnnotation;
      pages.forEach((c, pIndex) => {
        if (pIndex) throw 'should only be 1 page per response';
        c.blocks.forEach((f, blockIndex) => {
          // it turns out that there is only 1 paragraph inside a block
          f.paragraphs.forEach((g, paragraphIndex) => {
            // a paragraph can be used as a container for columnwise data
            const parabb = makebb({
              nv: g.boundingBox.normalizedVertices,
              pageIndex,
              blockIndex,
              paragraphIndex,
              pIndex,
            });
            // look through the rows and see where they best fit
            let cell = null;
            g.words.forEach((h, wordIndex, arr) => {
              const key = `${pageIndex}-${pIndex}-${blockIndex}-${paragraphIndex}-${wordIndex}`;
              const bb = makebb({
                nv: h.boundingBox.normalizedVertices,
                pageIndex,
                blockIndex,
                paragraphIndex,
                wordIndex,
                pIndex,
                parabb,
                key,
              });
              // not doing any further processing at this time, just reorganize into cells
              const text = joinSymbols({word: h});
              // the breaks are used to determine which words go in a cell
              const breaks = h.symbols
                .map(
                  t =>
                    t.property &&
                    t.property.detectedBreak &&
                    t.property.detectedBreak.type
                )
                .filter(t => t);
              // this is a character forced sure space
              if (PROXY_BREAKS.indexOf(text.slice(-1)) !== -1) {
                breaks.push('PROXY');
              }
              // initialize a new cell if needed
              cell = cell || {
                texts: [],
                sourceLocation: {
                  pageIndex,
                  blockIndex,
                  paragraphIndex,
                },
              };
              // register this one
              let cleaned = text;
              if (
                PROXY_DISPOSE &&
                breaks.find(t => ['PROXY'].indexOf(t) !== -1)
              ) {
                cleaned = cleaned.slice(0, -1);
              }
              if (!BLANK_DISPOSE || cleaned.replace(/\s+/, '').length) {
                cell.texts.push({
                  text: cleaned,
                  bb,
                  separator: breaks[0] === 'SPACE' ? ' ' : '',
                });
              }
              // if it's the end of a cell then record this as a completed cell
              if (
                breaks.find(
                  t =>
                    [
                      'EOL_SURE_SPACE',
                      'SURE_SPACE',
                      'LINE_BREAK',
                      'PROXY',
                    ].indexOf(t) !== -1
                )
              ) {
                if (cell.texts.length) {
                  cells.push(cell);
                }
                cell = null;
              }
            });
          });
        });
      });
    });
    return cells;
  }, []);
};

// just find the bb closest to the middle
const closestMiddlebb = (a, b, target) =>
  a && b
    ? Math.abs(a.bb.ubox.center.middle.y - target.ubox.center.middle.y) <
      Math.abs(b.bb.ubox.center.middle.y - target.ubox.center.middle.y)
      ? a
      : b
    : a || b;

// just find the bb closest to the left
const closestLeftbb = (a, b, target) =>
  a && b
    ? Math.abs(a.bb.raw.topLeft.x - target.raw.topLeft.x) <
      Math.abs(b.bb.raw.topLeft.x - target.raw.topLeft.x)
      ? a
      : b
    : a || b;

/**
 * This is not an exact science
 * the objective is to cluster together cells that appear on a similar y axis
 * and give them the same rowId
 */
const clusterRows = ({cells}) => {
  // sort into heights so we do the tallest sections first
  return (
    cells
      .sort((a, b) => a.texts[0].bb.ubox.height - b.texts[0].bb.ubox.height)
      .reduce((rows, cell) => {
        // see if we have one already
        const {bb} = cell.texts[0];
        let row = rows
          .filter(
            f =>
              bb.pageIndex === f.bb.pageIndex &&
              bb.ubox.center.middle.y <= f.bb.ubox.center.bottom.y &&
              bb.ubox.center.middle.y >= f.bb.ubox.center.top.y
          )
          // now find the nearest to the center of all rows collected so far
          .reduce((q, t) => closestMiddlebb(q, t, bb), null);
        // now row will be pointing to the nearest existing row or will be null
        // if no match then add it
        if (!row) {
          row = {
            count: 0,
            cells: [],
            id: rows.length,
            bb,
          };
          rows.push(row);
        }
        row.cells.push(cell);
        // keep them sorted in column order for convenience
        row.cells.sort(
          (a, b) => a.texts[0].bb.raw.topLeft.x - b.texts[0].bb.raw.topLeft.x
        );
        // cross ref for later
        cell.rowId = row.id;
        return rows;
      }, [])
      // sort in row order
      .sort(
        (a, b) =>
          a.bb.pageIndex +
          a.bb.ubox.center.middle.y -
          (b.bb.pageIndex + b.bb.ubox.center.middle.y)
      )
      .map((row, i) => {
        row.rowNumber = i;
        return row;
      })
  );
};

const closeColumn = ({bb, rightbb, text, tbb}) => {
  // this is only approximate as there may have been some cleared away proxy separators
  const charSize = (rightbb.raw.topRight.x - bb.raw.topLeft.x) / text.length;
  // allow a tolerance of 2 characters
  const tolerance = charSize * 2;
  return Math.abs(bb.raw.topLeft.x - tbb.bb.raw.topLeft.x) <= tolerance;
};

/**
 * put columns in the best match by row
 */
const allocateColumns = ({columns, rows}) => {
  rows.forEach(row => {
    row.cells.forEach(cell => {
      const bb = cell.texts[0].bb;
      const closeColumn = columns.reduce(
        (q, t) => closestLeftbb(q, t, bb),
        null
      );
      if (!closeColumn) {
        console.log(
          'failed to find close column for',
          joinCellTexts({cell}),
          ' row ',
          row.rowNumber
        );
      } else {
        const t = row.cells.find(f => closeColumn.id === f.columnId);
        if (t) {
          console.log(
            'dropping cell for',
            joinCellTexts({cell}),
            ' row ',
            row.rowNumber,
            ' already occupied by ',
            joinCellTexts({cell: t})
          );
          // this means it'll get ignored in a sheet render
          cell.columnId = -1;
        } else {
          cell.columnId = closeColumn.id;
        }
      }
    });
  });
};

/**
 * this is post processing to attach column and row numbers to the cells after all sorting etc
 */
const attachNumbers = ({cells, rows, columns}) => {
  // make lookups
  const cls = columns.reduce((p, c) => {
    p[c.id] = c.columnNumber;
    return p;
  }, []);
  const rws = rows.reduce((p, c) => {
    p[c.id] = c.rowNumber;
    return p;
  }, []);
  return cells.map(cell => {
    cell.rowNumber = rws[cell.rowId];
    // this will happen if the data could not be allocated to any column
    cell.columnNumber =
      cell.columnId === -1 ? cell.columnId : cls[cell.columnId];
    cell.id = cell.rowId * 10000 + cell.columnId;
    return cell;
  });
};

/**
 * This is not an exact science
 * the objective is to cluster together cells that appear on a similar x axis
 * the big problem here is that the source data is probably not columnized
 * so we need an arbitrary measure of what 'close' means
 * I'm giving it 2 characters adjusted for font size
 */
const clusterColumns = ({cells}) => {
  // sort into leftmost so leftmost happens first
  return (
    cells
      .sort(
        (a, b) => a.texts[0].bb.ubox.topLeft.x - b.texts[0].bb.ubox.topLeft.x
      )
      .reduce((columns, cell) => {
        // see if we have one already
        const text = joinCellTexts({cell});
        const {bb} = cell.texts[0];
        const {bb: rightbb} = cell.texts[cell.texts.length - 1];
        let column = columns
          // only consider boxes it lies in
          // and allow a little bit of tolerance
          .filter(tbb => closeColumn({bb, rightbb, text, tbb}))
          // now find the nearest of all columns collected so far
          .reduce((q, t) => closestLeftbb(q, t, bb), null);
        if (!column) {
          column = {
            bb,
            rightbb,
            id: columns.length,
            col: bb.raw.topLeft.x,
            columnId: -1,
          };
          columns.push(column);
        }
        return columns;
      }, [])
      // sort in column order
      .sort((a, b) => a.col - b.col)
      .map((column, i) => {
        column.columnNumber = i;
        return column;
      })
  );
};

const makeUbox = ({nv, parabb}) => {
  // this is the raw bb of the word nv (normalized vertices)
  const raw = {
    topLeft: {
      x: nv[0].x,
      y: nv[0].y,
    },
    topRight: {
      x: nv[1].x,
      y: nv[1].y,
    },
    bottomRight: {
      x: nv[2].x,
      y: nv[2].y,
    },
    bottomLeft: {
      x: nv[3].x,
      y: nv[3].y,
    },
  };
  // but we'll use the paragraph x co-ordinates to line those in the same paragraph up
  const pbox = {
    topLeft: {
      x: parabb ? parabb.raw.topLeft.x : nv[0].x,
      y: nv[0].y,
    },
    topRight: {
      x: parabb ? parabb.raw.topRight.x : nv[1].x,
      y: nv[1].y,
    },
    bottomRight: {
      x: parabb ? parabb.raw.bottomRight.x : nv[2].x,
      y: nv[2].y,
    },
    bottomLeft: {
      x: parabb ? parabb.raw.bottomLeft.x : nv[3].x,
      y: nv[3].y,
    },
  };
  // now create an unskewed parabound - this is what we'll use for most calculations
  // the normalized vertices seem to adjust for skew - but in case not, max/min top and bottom
  // to draw a rectangle around the extremities.
  const {topLeft: pTopLeft} = pbox;
  const {topLeft, topRight, bottomLeft, bottomRight} = raw;
  // there's a mixture of paragraph and raw here on purpose.
  // use the paragraph left edge for left dimensions
  const ubox = {
    topLeft: {
      x: Math.min(pTopLeft.x, topLeft.x),
      y: Math.min(topLeft.y, topRight.y),
    },
    topRight: {
      x: Math.max(topRight.x, bottomRight.x),
      y: Math.min(topLeft.y, topRight.y),
    },
    bottomRight: {
      x: Math.max(topRight.x, bottomRight.x),
      y: Math.max(bottomLeft.y, bottomRight.y),
    },
    bottomLeft: {
      x: Math.min(pTopLeft.x, bottomLeft.x),
      y: Math.max(bottomLeft.y, bottomRight.y),
    },
  };
  // the width/height/center is of the unskewed box
  ubox.width = ubox.topRight.x - ubox.topLeft.x;
  ubox.height = ubox.bottomRight.y - ubox.topRight.y;
  ubox.center = {
    top: {
      x: ubox.topLeft.x + ubox.width / 2,
      y: ubox.topLeft.y,
    },
    middle: {
      x: ubox.topLeft.x + ubox.width / 2,
      y: ubox.topLeft.y + ubox.height / 2,
    },
    bottom: {
      x: ubox.bottomLeft.x + ubox.width / 2,
      y: ubox.bottomLeft.y,
    },
  };
  return {
    raw,
    pbox,
    ubox,
    nv,
  };
};

const makebb = ({
  nv,
  pageIndex,
  blockIndex,
  paragraphIndex,
  wordIndex,
  pIndex,
  parabb,
  key,
}) => {
  // so we need to fiddle with the bb box to unskew it
  const ubox = makeUbox({nv, parabb});
  return {
    key,
    ...ubox,
    blockIndex,
    pageIndex,
    paragraphIndex,
    wordIndex,
    pIndex,
    parabb,
  };
};

module.exports = {
  start,
  getContents,
  clean,
  assignEntities,
  writeResult,
  getMids,
  init,
};
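kgSearch wraps the Knowledge Graph Search API and isn't shown in this post. As getMids uses it above, it needs an API key and must resolve to something with a data.itemListElement property, so a minimal sketch – assuming axios as the HTTP client and a hypothetical getKgApiKey accessor in visecrets – could look like this:

// illustrative only - the real wrapper is in ../common/kgsearch
const axios = require('axios');
const secrets = require('../private/visecrets');

let apiKey = null;
const init = ({mode}) => {
  // hypothetical accessor - an api key is needed for the knowledge graph search api
  apiKey = secrets.getKgApiKey({mode});
};

// fetch all the collected mids in a single call
const search = ({mids}) => {
  const params = new URLSearchParams();
  params.append('key', apiKey);
  params.append('limit', mids.length);
  // the ids parameter is repeated once per mid
  mids.forEach(mid => params.append('ids', mid));
  return axios.get(
    `https://kgsearch.googleapis.com/v1/entities:search?${params.toString()}`
  );
};

module.exports = {init, search};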
- languageContent – uses the Google Natural Language API to do some entity analysis on the text content. This can return an entity ‘mid’, which is a lookup key into the Google Knowledge Graph if the entity is known – for example, “Sean Connery” has an entry in the Knowledge Graph. A sketch of the call it makes follows below.
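languagecontent.js isn't reproduced in this post, but the entity analysis call it needs to make is straightforward with the @google-cloud/language client. This is a minimal sketch with the credential handling simplified to application default credentials:

// illustrative only - a minimal version of languagecontent.js
const language = require('@google-cloud/language');

let client = null;
const init = ({mode}) => {
  // assumes application default credentials are set up elsewhere
  client = new language.LanguageServiceClient();
};

// run entity analysis over the joined cell text
const start = async ({content}) => {
  const [result] = await client.analyzeEntities({
    document: {
      content,
      type: 'PLAIN_TEXT',
    },
  });
  // each entity has a name, type, salience and - for known entities - a metadata.mid
  return result.entities;
};

module.exports = {init, start};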