Zipf»Java

Java


package nevmuse.utilities;

import java.util.Arrays;

/**
 * This class contains non-static and static
 * methods that may be used to calculate
 * the slope and r2 (fit) of a trendline
 * of a Zipf distribution (byRank or bySize).
 * <br>
 * <br>The byRank distribution plots the values (y-axis)
 * against the ranks of the values from largest to smallest
 * (x-axis) in log-log scale. The ranks are generated automatically.
 * <br>
 * <br>The bySize distribution plots the values (y-axis)
 * against the supplised keys (x-axis) in log-log scale.
 * <br>
 * <br>
 * <b>Usage: </b> Create a Zipf object, add keys and/or values, then
 * run either the bySize() or byRank() method, and finally get the values
 * of slope and r2.
 * <br>
 * <br>You may use the static methods with no object instantiation.
 * <br>These return a double array containing the slope and r2 values.
 *
 * @author Chris Wagner and Bill Manaris (based on VB code by Chuck McCormick and Bill Manaris)
 * @version 1.1 (July 30, 2005)
 * @version 1.0 (May 10, 2003)
 */


public class Zipf
{
   /**
    * The natural log of 10
    */

   public static final double LN_10 = 2.3025850929940456840179914546844;   

   private double slope;   // Slope of trendline
   private double r2;   // Fit of trendline
   private double[] keys;  // Collection of keys for the values
   private double[] vals;  // Collection of values

   /**
    * Simple constructor.
    */

   public Zipf()
   {
      slope = r2 =0.0;
      keys = new double[0];
      vals = new double[0];
   }

   /**
    * It gets the calculated slope (does not re-calculate it every time).
    */

   public double getSlope() { return slope; }

   /**
    * It gets the calculated R^2 value (does not re-calculate it every time).
    */

   public double getR2() { return r2; }

   /**
    * It gets the keys (x-axis) used to calculate the Zipf distribution.
    */

   public double[] getKeys() { return keys; }

   /**
    * It gets the values (y-axis) used to calculate the Zipf distribution.
    */

   public double[] getVals() { return vals; }

   /**
    * It sets the keys (x-axis) used to calculate the Zipf distribution.
    */

   public void setKeys(double[] newKeys)
   {
      assert (newKeys.length > 0) :
         "Keys array should have at least one element.";

       // check for negative values in newKeys
       for (int i = 0; i < newKeys.length; i++)
           assert (newKeys[i] >= 0) :
              "Keys array should not contain negative values.";

       // things are fine, so process newKeys values
       // reduce newKeys to non-zero entries
       int nonZeroEntriesCounter = 0;
       for (int i = 0; i < newKeys.length; i++)
          if (newKeys[i] != 0)
             nonZeroEntriesCounter++;

       // create compact histogram
       double compactNewKeys[] = new double[nonZeroEntriesCounter];

       // populate compact newKeys
       int j = 0// compact newKeys index
       for (int i = 0; i < newKeys.length; i++)
          if (newKeys[i] != 0)
             compactNewKeys[j++] = newKeys[i];

      this.keys = compactNewKeys;      // store the set of keys in the corresponding instance variable
   }

   /**
    * It sets the values (y-axis) used to calculate the Zipf distribution.
    */

   public void setVals(double[] histogram)
   {
      assert (histogram.length > 0) :
         "Histogram array has no elements.";

       // check for negative values in histogram
       for (int i = 0; i < histogram.length; i++)
           assert (histogram[i] >= 0) :
              "Histogram array contains negative values.";

       // things are fine, so process histogram values
       // reduce histogram to non-zero entries
       int nonZeroEntriesCounter = 0;
       for (int i = 0; i < histogram.length; i++)
          if (histogram[i] != 0)
             nonZeroEntriesCounter++;

       // create compact histogram
       double compactHistogram[] = new double[nonZeroEntriesCounter];

       // populate compact histogram
       int j = 0// compact histogram index
       for (int i = 0; i < histogram.length; i++)
          if (histogram[i] != 0)
             compactHistogram[j++] = histogram[i];

      this.vals = compactHistogram;      // store set of values to corresponding instance variable
   }

   /**
    * Calculate the slope and R^2 of this object's values.  Keys are ignored.
    *
    * @see byRank(double[] vals)
    */

   public void byRank() {
      double[] sr2 = byRank(vals);
      slope = sr2[0];
      r2 = sr2[1];
   }

   /**
    * Calculate the slope and R^2 of this object's values using this object's keys.
    *
    * @see bySize(double[] keys, double[] vals)
    */

   public void bySize() {
      double[] sr2 = bySize(keys, vals);
      slope = sr2[0];
      r2 = sr2[1];
   }

   /**
    * Calculate the slope and R^2 of the values, ignoring key values and sorting the values
    * in descending order.
    *
    * @param vals The values to whose Zipf distribution we wish to calculate.
    * @return An array of doubles (slope is stored in index 0 and R^2 (fit) in index 1).
    */

   public static double[] byRank(double[] vals) {
      // Step 1: Sort the vals.
      // Copy vals so sort doesn't alter the original.
      double[] nvals = new double[vals.length];
      for(int i = 0; i < vals.length; i++) nvals[i] = vals[i];
      Arrays.sort(nvals);

      // Step 2: Create keys with rank values.
      double[] keys = new double[vals.length];
      for(int i = 0; i < vals.length; i++)
         keys[i] = vals.length - i;

      // Step 3: Get Zipf Slope and R2 of keys/values.
      return getSlopeR2(keys, nvals);
   }

   /**
    * Calculate the slope and r2 of the values without ordering.
    * Keys contains the desired 'ranking'.
    *
    * @param keys The keys (x-axis).
    * @param vals The values (y-axis).
    *
    * @return An array of doubles (slope is stored in index 0 and R^2 (fit) in index 1).
    */

   public static double[] bySize(double[] keys, double[] vals)
   {
       assert (keys.length == vals.length) :
          "Parallel arrays for keys and values do not have the same length.";

        /* *** System.out.print(" keys[]: ");
        for (int i = 0; i < keys.length; i++)
           System.out.print(keys[i] + " ");
        System.out.println();

        System.out.print(" vals[]: ");
        for (int i = 0; i < vals.length; i++)
           System.out.print(vals[i] + " ");
        System.out.println();*/


       // NOTE:  There is no need to sort the parallel arrays of keys and vals, since
       //        getSlopeR2() does not care if the keys are sorted in any particular order;
       //        it cares only that the association between keys[i] and vals[i] is correct.

      return getSlopeR2(keys, vals);
   }

   /**
    * Add (append) a key/value pair to the set.
    *
    * @param key  Key of this value.
    * @param val  Value for the key.
    */

   public void addKeyValue(double key, double val) {
      // Step 1: Make new keys and vals arrays.
      double[] newKeys = new double[keys.length + 1];
      double[] newVals = new double[vals.length + 1];

      // Step 2: Copy the old arrays.
      for(int i = 0; i < keys.length; i++) {
         newKeys[i] = keys[i];
         newVals[i] = vals[i];
      }

      // Step 3: Add the new values.
      newKeys[keys.length] = key;
      newVals[vals.length] = val;

      // Step 4: Use the new arrays.
      keys = newKeys;
      vals = newVals;
   }

   /**
    * Add (append) a value to the set.  It is given a key of zero.
    *
    * @param val  Value to add.
    */

   public void addValue(double val) {
      addKeyValue(0, val);
   }

   /*******************************
    * SUPPORTING METHODS
    *******************************/

   /**
    * Calculates the Zipf Slope and R^2(fit) of a
    * set of keys and values.
    * If slope and/or R^2 cannot be calculated, a zero is returned.
    *
    * @param keys The keys for the set
    * @param vals The values for the set
    *
    * @return An array of doubles (slope is stored in index 0 and R^2 (fit) in index 1).
    */

   private static double[] getSlopeR2(double[] keys, double[] vals) {
      // Log10(keys) is mapped to the X axis.
      // Log10(vals) is mapped to the Y axis.
      double sumX = 0// Sum of X values.
      double sumY = 0// Sum of Y values.
      double sumXY = 0// Sum of X*Y values.
      double sumX2 = 0// Sum of X*X values.
      double sumY2 = 0// Sum of Y*Y values.
      double[] sr2 = new double[2]// The return slope and r2.

      // Sum up the values for the calculations.
      for(int i = 0; i < keys.length; i++) {
         sumX += log10(keys[i]);
         sumY += log10(vals[i]);
         sumXY += log10(keys[i]) * log10(vals[i]);
         sumX2 += log10(keys[i]) * log10(keys[i]);
         sumY2 += log10(vals[i]) * log10(vals[i]);
      }

      // Set the slope.
      if((keys.length * sumX2 - sumX * sumX) == 0)
         sr2[0] = 0;
      else
         sr2[0] = ((keys.length * sumXY - sumX * sumY) / (keys.length * sumX2 - sumX * sumX));

      // If you want to create the line: y = mx + b
      // m = slope
      // This calculates b.
      // double b = (sumY - sr2[0] * sumX) / keys.length;

      // Set the R2.
      if(Math.sqrt((keys.length * sumX2 - sumX * sumX) * (keys.length * sumY2 - sumY * sumY)) == 0)
         sr2[1] = 0;
      else
         sr2[1] = (keys.length * sumXY - sumX * sumY) / Math.sqrt((keys.length * sumX2 - sumX * sumX) * (keys.length * sumY2 - sumY * sumY));
      sr2[1] = sr2[1] * sr2[1];

      // Return the result.
      return sr2;
   }

   /**
    * Calculates the Log base 10 of a number.
    * This is required because Math.log is not Log(10) but Ln (natural Log).
    *
    * Note: Log(b) n = Ln n / Ln b.
    *
    * @param n The original number.
    * @return Log(10) n
    */

   private static double log10(double n) {
      return Math.log(n)/LN_10;
   }
}