Extracting images from a document

If you have a document or book manuscript, you may need to extract its inline images and attach figure references to them. If you also want to name the images after the chapter in which they appear, this can end up being more complicated than you'd expect.

In this example, I have a large book manuscript with several hundred images. I want to extract them to Drive with names that reflect which chapter they are in, and index them in a spreadsheet.

My starting chapter paragraphs are identified as DocumentApp.ParagraphHeading.HEADING1.

Page Content hide

5 Share with your network

The approach

Using the technique described in Sorting bookmarks in a document, I can identify where all the images and all the paragraphs appear in a document. From this it can easily be deduced which chapter a given image is extracted from.

Code Walkthrough

Setup – it’s a container bound script for the active document.

// Drive folder path, relative to the root folder, that receives the extracted images
  var IMAGE_PATH = '/books/going gas/admin/draft3/images';
  
  // container-bound script: operate on the document this script is attached to
  var doc = DocumentApp.getActiveDocument();
  // the body is the section that is searched for chapter headings and inline images
  var body = doc.getBody();

Identify the paragraphs that are the chapter headers and figure out their chapter numbers and position in document.

// find the chapter headings: HEADING1 paragraphs whose first word is 'Chapter'
  var chapters = body.getParagraphs().filter(function(d) {
    return d.getHeading() === DocumentApp.ParagraphHeading.HEADING1 &&
      d.getText().trim().split(/\s+/)[0] === 'Chapter';
  })
  
  // and attach path in document & chapter number
  .map (function(d) {
    // split on any run of whitespace so a doubled or non-breaking space
    // cannot silently produce an empty chapter number
    var cs = d.getText().trim().split(/\s+/);
    if (cs.length < 2 ) throw new Error('no chapter number at ' + d.getText().slice(0,20));
    // path is a sortable string position, chapter is the word following 'Chapter'
    return {element:d , path:pathInDocument (d), chapter:cs[1]};
  });

Get all the images and figure out which chapters they are in by comparing their document position with that of each chapter

  // locate every inline image and decide which chapter it belongs to by
  // comparing its document path with the path of each chapter heading
  var images = body.getImages().map(function(d) {
    var path = pathInDocument (d);
    
    // the image belongs to a chapter when its path lies after that chapter's
    // heading and before the next chapter's heading (or there is no next chapter)
    var ch = chapters.filter(function (c,i,a) {
      return path > c.path && (i === a.length-1 || path < a[i+1].path);
    });
    
    // there should be exactly 1 result - anything else (for example an image
    // placed before the first chapter heading) is reported rather than guessed at
    if (ch.length !== 1) {
      throw new Error('no chapter found for image at path ' + path);
    }
    
    // attach chapter object
    return { 
      element:d,
      path:path , 
      paragraph: ch[0]
    };
  });

Index each image by its relative position within its chapter – will be used as the figure number and filename

  // and add the filename: '<chapter>.<n>' where n counts from 1 within each chapter.
  // A running counter per chapter gives the same numbering as re-scanning all
  // earlier images for every image, without the quadratic work.
  var figureCounts = Object.create(null);
  images.forEach (function(d) {
    var chapter = d.paragraph.chapter.toString();
    figureCounts[chapter] = (figureCounts[chapter] || 0) + 1;
    d.name = chapter + '.' + figureCounts[chapter].toString();
  });

Write them all out, converting them to jpg – this could be preceded by deleting all the files in the receiving folder, but I’ve taken that out for safety in case you use this code as is.

// now write them all out
  var folder = getDriveFolderFromPath(IMAGE_PATH);
  
  // getDriveFolderFromPath returns null when any part of the path is missing -
  // fail with a clear message instead of a TypeError on the first createFile
  if (!folder) {
    throw new Error('image folder not found: ' + IMAGE_PATH);
  }
  
  images.forEach(function(d) {
    // getAs converts the image data to JPEG; only relabelling the blob with a
    // .jpg name and content type would leave the original bytes unconverted
    var blob = d.element.getAs('image/jpeg').setName (d.name+".jpg");
    folder.createFile (blob);
  });

Record the results to a sheet along with some control information

// index to a sheet
  var sheet = SpreadsheetApp.openById('14xFQAdVKzlkujVZDwk27z-632y4FViBKwz-6Iv-UF3U').getSheetByName('figs');
  // getSheetByName returns null when there is no sheet with that name
  if (!sheet) {
    throw new Error('sheet not found: figs');
  }
  // clear everything below the header row
  sheet.getDataRange().offset(1,0).clearContent();
  // write the new values - one row per image: figure name, file name, type, start of chapter title.
  // A range must have at least one row, so skip the write when no images were found.
  if (images.length) {
    sheet.getRange(2,1,images.length, 4).setValues(images.map(function(d) {
      return [d.name,d.name+".jpg","Screenshot",d.paragraph.element.getText().slice(0,10)];
    }));
  }

Utility function for figuring out position in document

/**
 * Build a string describing where an element sits in the document tree.
 * Walks from the element up through its ancestors, prefixing the zero-padded
 * child index found at each level, e.g. '0003.0012.'. Because every index is
 * padded to 4 digits, plain string comparison of two paths follows document
 * order as long as each child index stays below 10000.
 * @param {object} element any object exposing getParent(); its ancestors must also expose getChildIndex(child)
 * @param {string} [path] suffix to append after the computed indexes (defaults to '')
 * @return {string} the dotted path, or the suffix unchanged when the element has no parent
 */
function pathInDocument(element,path) {
    var trail = path || '';
    var node = element;
    var container = node.getParent();

    // climb one level per iteration until there is no parent left
    while (container) {
      trail = Utilities.formatString('%04d.%s', container.getChildIndex(node), trail);
      node = container;
      container = node.getParent();
    }
    return trail;
}

Utility function for figuring out Drive folder from folder path name

/**
 * Resolve a '/'-separated folder path, starting at the Drive root folder.
 * Empty segments (leading, trailing or doubled slashes) are ignored.
 * @param {string} [path] folder path such as '/books/images'; empty or missing means the root folder
 * @return {object|null} the folder found at the end of the path, or null when any segment does not exist
 */
function getDriveFolderFromPath (path) {
    var segments = (path || "/").split("/");
    var folder = DriveApp.getRootFolder();

    for (var i = 0; i < segments.length; i++) {
      var segment = segments[i];

      // an empty segment leaves the current position unchanged
      if (!segment) {
        continue;
      }

      // a named segment below a folder that was not found cannot be resolved
      if (!folder) {
        folder = null;
        continue;
      }

      // take the first child folder with this name, if there is one
      var matches = folder.getFoldersByName(segment);
      folder = matches.hasNext() ? matches.next() : null;
    }
    return folder;
}

The output

Here’s a clip of the directory with some of the generated images.

and the index sheet.

The code

The whole thing

/**
 * Extract all the figures (inline images) in the active document and name them
 * after the chapter they are in.
 *
 * Steps:
 *  1. find the chapter headings (HEADING1 paragraphs whose first word is 'Chapter')
 *  2. assign each image to the chapter whose heading precedes it in the document
 *  3. name each image '<chapter>.<n>', n counting from 1 within the chapter
 *  4. write each image to the Drive folder IMAGE_PATH as '<name>.jpg'
 *  5. rewrite the rows below the header of the 'figs' sheet with one row per image
 *
 * @throws {Error} when a chapter heading has no chapter number, an image cannot be
 *   matched to exactly one chapter, the image folder does not exist, or the
 *   'figs' sheet does not exist
 */
function extractFigs() {
  
  // where to put the images
  var IMAGE_PATH = '/books/going gas/admin/draft3/images';
  
  // work on the current document
  var doc = DocumentApp.getActiveDocument();
  var body = doc.getBody();
  
  
  // get all the chapter heading level paragraphs that start with the word 'Chapter'
  var chapters = body.getParagraphs().filter(function(d) {
    return d.getHeading() === DocumentApp.ParagraphHeading.HEADING1 &&
      d.getText().trim().split(/\s+/)[0] === 'Chapter';
  })
  
  // and attach path in document & chapter number
  .map (function(d) {
    // split on any run of whitespace so a doubled or non-breaking space
    // cannot silently produce an empty chapter number
    var cs = d.getText().trim().split(/\s+/);
    if (cs.length < 2 ) throw new Error('no chapter number at ' + d.getText().slice(0,20));
    return {element:d , path:pathInDocument (d), chapter:cs[1]};
  });
  
  // now we can use the path in document to decide whether this image is in this chapter
  var images = body.getImages().map(function(d) {
    var path = pathInDocument (d);
    
    // path of image lies between start of this chapter and beginning of next
    var ch = chapters.filter(function (c,i,a) {
      return path > c.path && (i === a.length-1 || path < a[i+1].path);
    });
    
    // there should be exactly 1 result
    if (ch.length !== 1) {
      throw new Error('no chapter found for image at path ' + path);
    }
    
    // attach chapter object
    return { 
      element:d,
      path:path , 
      paragraph: ch[0]
    };
  });
  
  // and add the filename (index by 1 for each chapter) - a running counter per
  // chapter avoids re-scanning every earlier image for each image
  var figureCounts = Object.create(null);
  images.forEach (function(d) {
    var chapter = d.paragraph.chapter.toString();
    figureCounts[chapter] = (figureCounts[chapter] || 0) + 1;
    d.name = chapter + '.' + figureCounts[chapter].toString();
  });
  
  // now write them all out
  var folder = getDriveFolderFromPath(IMAGE_PATH);
  
  // the lookup returns null when any part of the path is missing
  if (!folder) {
    throw new Error('image folder not found: ' + IMAGE_PATH);
  }
  
  // resolve the folder name once rather than walking the Drive path for every image
  var folderName = folder.getName();
  
  images.forEach(function(d) {
    Logger.log(folderName + '/' + d.name);
    // getAs converts the image data to JPEG; only relabelling the blob with a
    // .jpg name and content type would leave the original bytes unconverted
    var blob = d.element.getAs('image/jpeg').setName (d.name+".jpg");
    folder.createFile (blob);
  });
  
  // index to a sheet
  var sheet = SpreadsheetApp.openById('14xFQAdVKzlkujVZDwk27z-632y4FViBKwz-6Iv-UF3U').getSheetByName('figs');
  if (!sheet) {
    throw new Error('sheet not found: figs');
  }
  // clear everything below the header row
  sheet.getDataRange().offset(1,0).clearContent();
  // write the new values - a range needs at least one row, so skip when there are no images
  if (images.length) {
    sheet.getRange(2,1,images.length, 4).setValues(images.map(function(d) {
      return [d.name,d.name+".jpg","Screenshot",d.paragraph.element.getText().slice(0,10)];
    }));
  }
  
  // utility functions
  
  /**
   * Build a sortable string position for an element: the zero-padded child
   * index at each level from the top of the document down, e.g. '0003.0012.'
   * @param {object} element the element to locate
   * @param {string} [path] the part of the path already built for lower levels
   * @return {string} the path
   */
  function pathInDocument(element,path) {
    path = path || '';
    var parent = element.getParent();
    if (parent) {
      path = pathInDocument( parent , Utilities.formatString ( '%04d.%s', parent.getChildIndex(element),path  ));
    }
    return path;
  }
  
  /**
   * Resolve a '/'-separated folder path starting at the Drive root folder.
   * @param {string} [path] the folder path; empty means the root folder
   * @return {object|null} the folder, or null when any part of the path is missing
   */
  function getDriveFolderFromPath (path) {
    return (path || "/").split("/").reduce ( function(prev,current) {
      if (prev && current) {
        var fldrs = prev.getFoldersByName(current);
        return fldrs.hasNext() ? fldrs.next() : null;
      }
      else { 
        // empty segment keeps the current folder; a name below a missing folder is missing too
        return current ? null : prev; 
      }
    },DriveApp.getRootFolder()); 
  }
}

For more like this see Google Apps Scripts Snippets