Matching the site tree to the analytics data

This is also part of each evening's scheduled run. The site is serialized and each analytics entry is matched up to a page on the site. The site links in the analytics data are of variable quality.

Here's how the pages are matched. Each URL is matched to as deep a level as possible - when the matching stops, it falls back to the last successful match. That way every page can be allocated somewhere on the site within the topic to which it belongs, even when an exact page match is not possible.

/**
 * Match up the analytics records to the site pages and accumulate the counts.
 * Each record is resolved to a tree node with matchToPto; matched records add
 * their page views to that node's analytics bucket for the given period, and
 * unmatched records are written to the log.
 * @param {Array.<object>} analytics the analytics records (each read for pagePath and pageViews)
 * @param {object} root the root node of the site tree, handed to matchToPto
 * @param {string} siteUrl the site url; one trailing slash is dropped before matching
 * @param {string|number} period the key under which counts are stored in node.analytics
 */
function matchAnalytics ( analytics, root, siteUrl, period) {

  // drop a single trailing slash - this is the prefix matchToPto puts in front of each pagePath
  var base = siteUrl.replace(/\/$/,"");

  // for each analytics record, attempt to match it to a site url
  analytics.forEach (function(d) {
    var pto = matchToPto (root , d ,base);
    if (!pto) {
      // could not find a good match
      Logger.log('no match for ' + JSON.stringify(d));
    }
    else {
      // found a good match add the counts
      addAnalytics(pto,d,period);
    }
  });

  /**
   * Add one analytics record to the period bucket of a matched node.
   * @param {object} pto the matched node (its analytics property is updated)
   * @param {object} d the analytics record
   * @param {string|number} period the bucket key
   */
  function addAnalytics(pto,d,period){
    // find the period for this item or add it - a node that has not yet
    // seen this period has no bucket, so start one with zeroed counters
    var p = pto.analytics[period];
    if (!p) {
      p = pto.analytics[period] = { pageViews:0, varieties:0 };
    }
    // we have some pageviews for this site page
    p.pageViews += parseInt(d.pageViews,10);
    // one more distinct analytics record rolled up into this page
    p.varieties ++;
  }

}

/**
 * Find the deepest node in the tree whose url is a prefix of base + data.pagePath.
 * A node only qualifies if it matches itself; its children are then tried
 * recursively and the match with the longest url wins (the first one found
 * is kept when lengths tie).
 * @param {object} root the node to start from (needs getUrl() and children)
 * @param {object} data an analytics record (read for pagePath)
 * @param {string} base the site url that is put in front of data.pagePath
 * @return {object|undefined} the best matching node, or undefined if root does not match
 */
function matchToPto( root, data ,base) {
  var url = root.getUrl();

  // a node without a url can never be a prefix of anything;
  // treat it as a non-match rather than failing on url.length
  if (typeof url !== 'string') {
    return undefined;
  }

  // match analytics url to page url so far
  var target = base + data.pagePath;
  if (target.slice (0,url.length) !== url) {
    return undefined;
  }

  // objective is to find the best match .. continuing to drill down until no more.
  var match = root;
  root.children.forEach (function (child) {
    var m = matchToPto (child , data , base);
    // pick the match with the longest url (for example x/y/long would match, but x/y/longer would be better).
    if (m && m.getUrl().length > match.getUrl().length) {
      match = m;
    }
  });
  return match;
}


On completion I have a nice tree structure with one of these for each site page.
/**
 * PageTreeObject
 * One node of the site tree - there is one of these for each known page.
 * Holds the page, its parent node, and the public collections
 * analytics, children and plusOnes that other code fills in.
 * @constructor
 * @param {PageTreeObject} parent the parent
 * @param {Page} page a Sites Page (may be null, in which case name/url/title are null)
 */
function PageTreeObject (parent,page) {
  // one of these for each known page
  var parent_ = parent;
  var page_ = page;
  var topic_;
  // all methods go through self so they keep working when detached from the instance
  var self = this;
  var url_, pageType_=  PAGETYPES.REGULAR;


  this.analytics =[];
  this.children = [];
  this.plusOnes = 0;

  /** @return {PageTreeObject} the parent node given at construction */
  this.getParent = function() {
    return parent_;
  };

  /** @return {Page} the page given at construction */
  this.getPage = function () {
    return page_;
  };

  /** @return {*} the current page type (starts as PAGETYPES.REGULAR) */
  this.getPageType = function () {
    return pageType_;
  };

  /** @param {*} pageType the new page type */
  this.setPageType = function (pageType) {
    pageType_ = pageType;
  };

  /** @return {boolean} true if the page type is PAGETYPES.TOPICROOT */
  this.getTopicRoot = function () {
    return pageType_ === PAGETYPES.TOPICROOT ;
  };

  /** @param {*} topic the topic to record against this node */
  this.setTopic = function (topic) {
    topic_ = topic;
  };

  /** @return {*} the topic last set with setTopic (undefined if never set) */
  this.getTopic = function () {
    return topic_;
  };

  /** @return {string|null} the page name, or null if there is no page */
  this.getName = function () {
    return page_ ? page_.getName() : null;
  };

  /** @return {string|null} the page url, or null if there is no page */
  this.getUrl = function () {
    // remember the url once we have one so the page is not asked again
    if(!url_) {
      url_ = page_ ? page_.getUrl() : null;
    }
    return url_;
  };

  /** @return {string|null} the page title, or null if there is no page */
  this.getTitle = function () {
    return page_ ? page_.getTitle() : null;
  };

  /** @return {Array} the analytics collection of this node */
  this.getAnalytics = function () {
    return self.analytics ;
  };

  /**
   * Summarize this node as a plain object.
   * @return {object} url, numChildren, pageType, analytics, topicRoot, name, title, plusOnes
   */
  this.express = function () {

    return {
      url: self.getUrl(),
      numChildren: self.children.length,
      pageType:self.getPageType(),
      analytics: self.getAnalytics(),
      topicRoot: self.getTopicRoot(),
      name:self.getName(),
      title:self.getTitle(),
      plusOnes:self.plusOnes
    };
  };

  /** @return {string} the result of express() as JSON */
  this.stringify = function () {
    return JSON.stringify(self.express());
  };
}


For help and more information, join our forum, follow the blog, or follow me on Twitter.



Comments