Zipf.Java History

Hide minor edits - Show changes to output

Changed line 28 from:
* @author Chris Wagner and Bill Manaris (based on VB code by Chuck McCormick and Bill Manaris )
to:
* @author Chris Wagner and Bill Manaris (based on VB code by Chuck McCormick and Bill Manaris)
Changed line 41 from:
private double r2; // Fit of trendline
to:
private double r2; // Fit of trendline
Added lines 1-318:
(:source lang=Java tabwidth=3 -trim :)

package nevmuse.utilities;

import java.util.Arrays;

/**
* This class contains non-static and static
* methods that may be used to calculate
* the slope and r2 (fit) of a trendline
* of a Zipf distribution (byRank or bySize).
* <br>
* <br>The byRank distribution plots the values (y-axis)
* against the ranks of the values from largest to smallest
* (x-axis) in log-log scale. The ranks are generated automatically.
* <br>
* <br>The bySize distribution plots the values (y-axis)
* against the supplised keys (x-axis) in log-log scale.
* <br>
* <br>
* <b>Usage: </b> Create a Zipf object, add keys and/or values, then
* run either the bySize() or byRank() method, and finally get the values
* of slope and r2.
* <br>
* <br>You may use the static methods with no object instantiation.
* <br>These return a double array containing the slope and r2 values.
*
* @author Chris Wagner and Bill Manaris (based on VB code by Chuck McCormick and Bill Manaris )
* @version 1.1 (July 30, 2005)
* @version 1.0 (May 10, 2003)
*/

public class Zipf
{
/**
* The natural log of 10
*/
public static final double LN_10 = 2.3025850929940456840179914546844;

private double slope; // Slope of trendline
private double r2; // Fit of trendline
private double[] keys; // Collection of keys for the values
private double[] vals; // Collection of values

/**
* Simple constructor.
*/
public Zipf()
{
slope = r2 =0.0;
keys = new double[0];
vals = new double[0];
}

/**
* It gets the calculated slope (does not re-calculate it every time).
*/
public double getSlope() { return slope; }

/**
* It gets the calculated R^2 value (does not re-calculate it every time).
*/
public double getR2() { return r2; }

/**
* It gets the keys (x-axis) used to calculate the Zipf distribution.
*/
public double[] getKeys() { return keys; }

/**
* It gets the values (y-axis) used to calculate the Zipf distribution.
*/
public double[] getVals() { return vals; }

/**
* It sets the keys (x-axis) used to calculate the Zipf distribution.
*/
public void setKeys(double[] newKeys)
{
assert (newKeys.length > 0) :
"Keys array should have at least one element.";

// check for negative values in newKeys
for (int i = 0; i < newKeys.length; i++)
assert (newKeys[i] >= 0) :
"Keys array should not contain negative values.";

// things are fine, so process newKeys values
// reduce newKeys to non-zero entries
int nonZeroEntriesCounter = 0;
for (int i = 0; i < newKeys.length; i++)
if (newKeys[i] != 0)
nonZeroEntriesCounter++;

// create compact histogram
double compactNewKeys[] = new double[nonZeroEntriesCounter];

// populate compact newKeys
int j = 0; // compact newKeys index
for (int i = 0; i < newKeys.length; i++)
if (newKeys[i] != 0)
compactNewKeys[j++] = newKeys[i];

this.keys = compactNewKeys; // store the set of keys in the corresponding instance variable
}

/**
* It sets the values (y-axis) used to calculate the Zipf distribution.
*/
public void setVals(double[] histogram)
{
assert (histogram.length > 0) :
"Histogram array has no elements.";

// check for negative values in histogram
for (int i = 0; i < histogram.length; i++)
assert (histogram[i] >= 0) :
"Histogram array contains negative values.";

// things are fine, so process histogram values
// reduce histogram to non-zero entries
int nonZeroEntriesCounter = 0;
for (int i = 0; i < histogram.length; i++)
if (histogram[i] != 0)
nonZeroEntriesCounter++;

// create compact histogram
double compactHistogram[] = new double[nonZeroEntriesCounter];

// populate compact histogram
int j = 0; // compact histogram index
for (int i = 0; i < histogram.length; i++)
if (histogram[i] != 0)
compactHistogram[j++] = histogram[i];

this.vals = compactHistogram; // store set of values to corresponding instance variable
}

/**
* Calculate the slope and R^2 of this object's values. Keys are ignored.
*
* @see byRank(double[] vals)
*/
public void byRank() {
double[] sr2 = byRank(vals);
slope = sr2[0];
r2 = sr2[1];
}

/**
* Calculate the slope and R^2 of this object's values using this object's keys.
*
* @see bySize(double[] keys, double[] vals)
*/
public void bySize() {
double[] sr2 = bySize(keys, vals);
slope = sr2[0];
r2 = sr2[1];
}

/**
* Calculate the slope and R^2 of the values, ignoring key values and sorting the values
* in descending order.
*
* @param vals The values to whose Zipf distribution we wish to calculate.
* @return An array of doubles (slope is stored in index 0 and R^2 (fit) in index 1).
*/
public static double[] byRank(double[] vals) {
// Step 1: Sort the vals.
// Copy vals so sort doesn't alter the original.
double[] nvals = new double[vals.length];
for(int i = 0; i < vals.length; i++) nvals[i] = vals[i];
Arrays.sort(nvals);

// Step 2: Create keys with rank values.
double[] keys = new double[vals.length];
for(int i = 0; i < vals.length; i++)
keys[i] = vals.length - i;

// Step 3: Get Zipf Slope and R2 of keys/values.
return getSlopeR2(keys, nvals);
}

/**
* Calculate the slope and r2 of the values without ordering.
* Keys contains the desired 'ranking'.
*
* @param keys The keys (x-axis).
* @param vals The values (y-axis).
*
* @return An array of doubles (slope is stored in index 0 and R^2 (fit) in index 1).
*/
public static double[] bySize(double[] keys, double[] vals)
{
assert (keys.length == vals.length) :
"Parallel arrays for keys and values do not have the same length.";

/* *** System.out.print(" keys[]: ");
for (int i = 0; i < keys.length; i++)
System.out.print(keys[i] + " ");
System.out.println();

System.out.print(" vals[]: ");
for (int i = 0; i < vals.length; i++)
System.out.print(vals[i] + " ");
System.out.println();*/

// NOTE: There is no need to sort the parallel arrays of keys and vals, since
// getSlopeR2() does not care if the keys are sorted in any particular order;
// it cares only that the association between keys[i] and vals[i] is correct.

return getSlopeR2(keys, vals);
}

/**
* Add (append) a key/value pair to the set.
*
* @param key Key of this value.
* @param val Value for the key.
*/
public void addKeyValue(double key, double val) {
// Step 1: Make new keys and vals arrays.
double[] newKeys = new double[keys.length + 1];
double[] newVals = new double[vals.length + 1];

// Step 2: Copy the old arrays.
for(int i = 0; i < keys.length; i++) {
newKeys[i] = keys[i];
newVals[i] = vals[i];
}

// Step 3: Add the new values.
newKeys[keys.length] = key;
newVals[vals.length] = val;

// Step 4: Use the new arrays.
keys = newKeys;
vals = newVals;
}

/**
* Add (append) a value to the set. It is given a key of zero.
*
* @param val Value to add.
*/
public void addValue(double val) {
addKeyValue(0, val);
}

/*******************************
* SUPPORTING METHODS
*******************************/
/**
* Calculates the Zipf Slope and R^2(fit) of a
* set of keys and values.
* If slope and/or R^2 cannot be calculated, a zero is returned.
*
* @param keys The keys for the set
* @param vals The values for the set
*
* @return An array of doubles (slope is stored in index 0 and R^2 (fit) in index 1).
*/
private static double[] getSlopeR2(double[] keys, double[] vals) {
// Log10(keys) is mapped to the X axis.
// Log10(vals) is mapped to the Y axis.
double sumX = 0; // Sum of X values.
double sumY = 0; // Sum of Y values.
double sumXY = 0; // Sum of X*Y values.
double sumX2 = 0; // Sum of X*X values.
double sumY2 = 0; // Sum of Y*Y values.
double[] sr2 = new double[2]; // The return slope and r2.

// Sum up the values for the calculations.
for(int i = 0; i < keys.length; i++) {
sumX += log10(keys[i]);
sumY += log10(vals[i]);
sumXY += log10(keys[i]) * log10(vals[i]);
sumX2 += log10(keys[i]) * log10(keys[i]);
sumY2 += log10(vals[i]) * log10(vals[i]);
}

// Set the slope.
if((keys.length * sumX2 - sumX * sumX) == 0)
sr2[0] = 0;
else
sr2[0] = ((keys.length * sumXY - sumX * sumY) / (keys.length * sumX2 - sumX * sumX));

// If you want to create the line: y = mx + b
// m = slope
// This calculates b.
// double b = (sumY - sr2[0] * sumX) / keys.length;

// Set the R2.
if(Math.sqrt((keys.length * sumX2 - sumX * sumX) * (keys.length * sumY2 - sumY * sumY)) == 0)
sr2[1] = 0;
else
sr2[1] = (keys.length * sumXY - sumX * sumY) / Math.sqrt((keys.length * sumX2 - sumX * sumX) * (keys.length * sumY2 - sumY * sumY));
sr2[1] = sr2[1] * sr2[1];

// Return the result.
return sr2;
}

/**
* Calculates the Log base 10 of a number.
* This is required because Math.log is not Log(10) but Ln (natural Log).
*
* Note: Log(b) n = Ln n / Ln b.
*
* @param n The original number.
* @return Log(10) n
*/
private static double log10(double n) {
return Math.log(n)/LN_10;
}
}

(:sourcend:)