If you want to create a file of over 10mb in Drive, you can’t use either the DriveApp service or the Advanced Drive service. Even if you use the JSON API, the restriction of 10mb on POST body sizes in UrlFetch stops you doing that too.
If the file is for your own use, then you can zip it right there in Apps Script – we’ll come to that later, but if the file needs to stay as is, then you are pretty much screwed.
Unless you use resumable uploads, and do the thing in chunks. The problem is that these are kind of complicated – so I’ve added a function to my cUseful library to do it for you.
Here’s an example of reading a 50mb file, doing something with it, and writing it back again.
function makeBigQueryFileFromDrive() { // get the file questions var file = DriveApp.getFileById("0B92ExLh4POiZbXB0X2RDWXZqLTA"); // get the data var data = JSON.parse (file.getBlob().getDataAsString()); // write it out in a way that bigQuery can deal with var newData = data.map(function(d) { return JSON.stringify(d); }).join("\n"); // this file would be too big, so need to use drive api // DriveApp.getFiles() // provoke a drive token var name = 'bigQueryJeopardy.txt'; var status = cUseful.FetchUtils .setService(UrlFetchApp) .resumableUpload ( ScriptApp.getOAuthToken(), Utilities.newBlob(newData, MimeType.PLAIN_TEXT, name), file.getParents().next().getId() ); return; }
Walkthrough.
Get some data from Drive and play around with it. This case converts a JSON file to delimited JSON for input to BigQuery (see Loading large JSON datasets into BigQuery with Apps Script for how to do this directly with Apps Script without creating an intermediate file).
// get the file questions var file = DriveApp.getFileById("0B92ExLh4POiZbXB0X2RDWXZqLTA"); // get the data var data = JSON.parse (file.getBlob().getDataAsString()); // write it out in a way that bigQuery can deal with var newData = data.map(function(d) { return JSON.stringify(d); }).join("\n");
Writing the large file
There’s not much to this at all
// this file would be too big, so need to use drive api var name = 'bigQueryJeopardy.txt'; var status = cUseful.FetchUtils .setService(UrlFetchApp) .resumableUpload ( ScriptApp.getOAuthToken(), Utilities.newBlob(newData, MimeType.PLAIN_TEXT, name), file.getParents().next().getId() );
-
- Because the cUseful library is dependency free (no services are referenced directly in it to avoid unnecessary requests for auhtorization), you need to pass over UrlFetchApp
.setService(UrlFetchApp)
-
- You need to have enabled a Drive scopes token by mentioning DriveApp in your code .. even in a comment, and enabled the Drive API in the developer console for this project.
- The data should be passed as a blob with mimetype and filename included in it.
- In this example I wanted the output file to be in the same folder as the source file. If you leave out this argument, it will go in the top level.
.resumableUpload ( ScriptApp.getOAuthToken(), Utilities.newBlob(newData, MimeType.PLAIN_TEXT, name), file.getParents().next().getId() );
Using zip
You can zip your file to make it smaller. You can still use it with resumable uploads, even if it can write the whole thing in one go. Here’s the example modified, so that it zips the content first, then unzips it and checked it worked.
function makeBigQueryFileFromDrive() { // get the file questions var file = DriveApp.getFileById("0B92ExLh4POiZbXB0X2RDWXZqLTA"); // get the data var data = JSON.parse (file.getBlob().getDataAsString()); // write it out in a way that bigQuery can deal with var newData = data.map(function(d) { return JSON.stringify(d); }).join("\n"); var name = 'big.txt'; var blob = Utilities.newBlob(newData, MimeType.PLAIN_TEXT, name); // you can zip this if necessary by adding this line. var zip = Utilities.zip([blob],'smaller.zip'); var status = cUseful.FetchUtils .setService(UrlFetchApp) .resumableUpload ( ScriptApp.getOAuthToken(), zip || blob, file.getParents().next().getId() ); // check it worked var writtenBlob = DriveApp.getFileById(status.resource.id).getBlob(); // if its a zip then unzip it var content = (writtenBlob.getContentType() === MimeType.ZIP ? Utilities.unzip(writtenBlob)[0] : writtenBlob).getDataAsString(); // check the content matches Logger.log ( content === newData ? 'all was good' : 'data didnt match'); return; }
It still needed to do this in 2 chunks, but at least the file written was only 12 mb rather than the original 50+mb. Here’s the log
smaller.zip is 66% complete (chunk transfer rate 563978 bytes per second) for chunk 1 smaller.zip(0B92ExLh4POiZMG1JMG53eWxXV3M) is finished uploading 12797540 bytes in 2 chunks (overall transfer rate 567695 bytes per second) all was good
Reporting on progress
You can provide a function as a 4th argument to resumableUpload which will be called after each chunk is loaded. If you dont provide a function, this default one will be called, which logs progress with Logger. If you return true from this function it will cancel the upload.
var chunkFunction = func || function ( status) { // you can replace this function with your own. // it gets called after each chunk // do something on completion if (status.done) { Logger.log ( status.resource.title + ' is finished uploading ' + status.content.length + ' bytes in ' + (status.index+1) + ' chunks ' + ' (transfer rate ' + Math.round(content.length*1000/(new Date().getTime() - status.startTime)) + ' bytes per second)' ); } // do something on successful completion of a chunk else if (status.success) { Logger.log ( status.resource.title + ' is ' + Math.round(status.ratio*100) + '% complete ' + ' (transfer rate ' + Math.round(status.size*1000/(new Date().getTime() - status.startChunkTime)) + ' bytes per second' + ' after ' + (status.index+1) + ' chunks ' ); } // decide what to do on an error else if (response.getResponseCode() === 503 ) { throw 'error 503 - you can try restarting using ' + status.location; } // its some real error else { throw response.getContentText() + ' you might be able to restart using ' + location; } // if you want to cancel return true return false; };
This progress function is passed a whole bunch of things that can be used for reporting or restarting.
/** * @param {object} status the status of the transfer * status.start the start position in the content just processed * status.size the size of the chunk * status.index the index number (0 base) of the chunk * status.location the restartable url * status.content the total content * status.response the httr response of this attempt * status.success whether this worked (see response.getResponseCode() for more * status.done whether its all done successfully * status.ratio ratio complete * status.resource the file resource * status.startTime timestamp of when it all started * status.startChunkTime timestamp of when this chunk started * @return {boolean} whether to cancel (true means cancel the upload) */
This status object is also returned from the resumable upload function.
Here’s what the default progress report looks like
bigQueryJeopardy.txt is 16% complete (transfer rate 692587 bytes per second after 1 chunks bigQueryJeopardy.txt is 32% complete (transfer rate 779321 bytes per second after 2 chunks bigQueryJeopardy.txt is 48% complete (transfer rate 805977 bytes per second after 3 chunks bigQueryJeopardy.txt is 64% complete (transfer rate 795808 bytes per second after 4 chunks bigQueryJeopardy.txt is 80% complete (transfer rate 822171 bytes per second after 5 chunks bigQueryJeopardy.txt is 96% complete (transfer rate 778959 bytes per second after 6 chunks bigQueryJeopardy.txt is finished uploading 52508726 bytes in 7 chunks (transfer rate 766752 bytes per second)
Restarting an interrupted load.
If you run into quota problems and want to continue with the upload at a later time, you can use the contents in the status object to do that at a later date. Here’s how –
cUseful.FetchUtils.setService(UtlFetchApp).resumeUpload (accessToken,blob,location,start,func);
where
- accessToken:the access token to use
- blob: a blob with the total content, filename and mimetype . The content is available in status.content, or you can recreate it for the resumption
- location. This is the restart url. it can be found in status.location
- start. The resume start position in the contents. This would be status.start + status.size
- func. The optional function to monitor progress. If missing the default one will be used.
Defeating the quotas
If you’re not careful you’ll soon run out of quota (100mb a day is all you can have on UrlFetch and if you break that, pretty much nothing will work till tomorrow), so to do this I created another account, shared a directory with it, and ran it from that other account.
The code
The code is in the cUseful library, and below
/** * Utils contains useful functions for working with urlfetchapp * @namespace FetchUtils */ var FetchUtils = (function (ns) { /** * to keep this namespace dependency free * this must be run to set the driveappservice before it can be used * @param {fetchapp} dap the fetchapp * @return {FetchUtils} self */ ns.setService = function (dap) { ns.service = dap; return ns; }; ns.checkService = function () { if(!ns.service) { throw 'please do a FetchUtils.setService (yoururlfetchapp) to inialise namespace'; } }; /** * restart a resumable upload * @param {string} accessToken the token * @param {blob} contblobent the content * @param {string} location the location url * @param {string} start the start position * @param {function} [func] a func to call after each chunk * @return {object} the status from the last request */ ns.resumeUpload = function (accessToken,blob,location,start,func) { var MAXPOSTSIZE = 1024*1024*8; ns.checkService(); //get the content and make the resource var content = blob.getBytes(); var file = { title: blob.getName(), mimeType:blob.getContentType() }; var chunkFunction = func || function ( status) { // you can replace this function with your own. // it gets called after each chunk // do something on completion if (status.done) { Logger.log ( status.resource.title + '(' + status.resource.id + ')' + '\n' + ' is finished uploading ' + status.content.length + ' bytes in ' + (status.index+1) + ' chunks ' + ' (overall transfer rate ' + Math.round(content.length*1000/(new Date().getTime() - status.startTime)) + ' bytes per second)' ); } // do something on successful completion of a chunk else if (status.success) { Logger.log ( status.resource.title + ' is ' + Math.round(status.ratio*100) + '% complete ' + ' (chunk transfer rate ' + Math.round(status.size*1000/(new Date().getTime() - status.startChunkTime)) + ' bytes per second)' + ' for chunk ' + (status.index+1) ); } // decide what to do on an error else if (response.getResponseCode() === 503 ) { throw 'error 503 - you can try restarting using ' + status.location; } // its some real error else { throw response.getContentText() + ' you might be able to restart using ' + location; } // if you want to cancel return true return false; }; var startTime = new Date().getTime(); // now do the chunks var pos = 0, index = 0 ; do { // do it in bits var startChunkTime = new Date().getTime(); var chunk = content.slice (pos , Math.min(pos+MAXPOSTSIZE, content.length)); var options = { contentType:blob.getContentType(), method:"PUT", muteHttpExceptions:true, headers: { "Authorization":"Bearer " + accessToken, "Content-Range": "bytes "+pos+"-"+(pos+chunk.length-1)+"/"+content.length } }; // load this chunk of data options.payload = chunk; // now we can send the file // but .... UrlFetch failed because too much upload bandwidth was used var response = Utils.expBackoff (function () { return ns.service.fetch (location, options) ; }); // the actual data size transferred var size = chunk.length; if (response.getResponseCode() === 308 ) { var ranges = response.getHeaders().Range.split('=')[1].split('-'); var size = parseInt (ranges[1],10) - pos + 1; if (size !== chunk.length ) { Logger.log ('chunk length mismatch - sent:' + chunk.length + ' but confirmed:' + size + ':recovering by resending the difference'); } }; // catch the file id if (!file.id) { try { file.id = JSON.parse(response.getContentText()).id; } catch (err) { // this is just in case the contenttext is not a proper object } } var status = { start:pos, size:size, index:index, location:location, response:response, content:content, success:response.getResponseCode() === 200 || response.getResponseCode() === 308, done:response.getResponseCode() === 200, ratio:(size + pos) / content.length, resource:file, startTime:startTime, startChunkTime:startChunkTime }; index++; pos += size; // now call the chunk completefunction var cancel = chunkFunction ( status ); } while ( !cancel && status.success && !status.done); return status; }; /** * resumable upload * @param {string} accessToken an accesstoken with Drive scope * @param {blob} blob containg the data, type and name * @param {string} [folderId] the folderId to be the parent * @param {function} func a func to call after each chunk * @return {object} the status from the last request */ ns.resumableUpload = function (accessToken,blob,folderId,func) { ns.checkService(); /** * @param {object} status the status of the transfer * status.start the start position in the content just processed * status.size the size of the chunk * status.index the index number (0 base) of the chunk * status.location the restartable url * status.content the total content * status.response the httr response of this attempt * status.success whether this worked (see response.getResponseCode() for more * status.done whether its all done successfully * status.ratio ratio complete * status.resource the file resource * status.startTime timestamp of when it all started * status.startChunkTime timestamp of when this chunk started * @return {boolean} whether to cancel (true means cancel the upload) */ //get the content and make the resource var content = blob.getBytes(); var file = { title: blob.getName(), mimeType:blob.getContentType() }; // assign to a folder if given if (folderId) { file.parents = [{id:folderId}]; } // this sends the metadata and gets back a url var resourceBody = JSON.stringify(file); var headers = { "X-Upload-Content-Type":blob.getContentType(), "X-Upload-Content-Length":content.length , "Authorization":"Bearer " + accessToken, }; var response = Utils.expBackoff( function () { return ns.service.fetch ("https://www.googleapis.com/upload/drive/v2/files?uploadType=resumable", { headers:headers, method:"POST", muteHttpExceptions:true, payload:resourceBody, contentType: "application/json; charset=UTF-8", contentLengthxx:resourceBody.length }); }); if (response.getResponseCode() !== 200) { throw 'failed on initial upload ' + response.getResponseCode(); } // get the resume location var location = getLocation (response); return ns.resumeUpload (accessToken,blob,location,0,func); function getLocation (resp) { if(resp.getResponseCode()!== 200) { throw 'failed in setting up resumable upload ' + resp.getContentText(); } // the location we need comes back as a header var location = resp.getHeaders().Location; if (!location) { throw 'failed to get location for resumable uploade'; } return location; } }; return ns; })(FetchUtils || {});