001/*-
002 * Copyright 2016 Diamond Light Source Ltd.
003 *
004 * All rights reserved. This program and the accompanying materials
005 * are made available under the terms of the Eclipse Public License v1.0
006 * which accompanies this distribution, and is available at
007 * http://www.eclipse.org/legal/epl-v10.html
008 */
009
010package org.eclipse.january.dataset;
011
012/**
013 * Estimators of the scale of a Dataset.
014 * <p>
015 * A class of static methods to produce estimations of the scale of variation within a Dataset.
016 * The available estimators are:
017 * <ul>
018 * <li> Median Absolute Deviation </li>
019 * <li> S<sub>n</sub> of Croux and Rousseeuw (1992).</li>
020 * </ul> 
021 * <p>
022 * Croux, C. and P. J. Rousseeuw, "Time-efficient algorithms for two highly robust estimators of scale", Computational Statistics, Volume 1, eds. Y. Dodge and J.Whittaker, Physica-Verlag, Heidelberg, pp411--428 (1992).
023 */
024public class Outliers {
025
026        private final static double MADSCALEFACTOR = 1.4826;
027        private final static double SNSCALEFACTOR = 1.1926;
028        
029        /**
030         * Returns the Median Absolute Deviation (MAD) and the median. 
031         * @param data
032         *                      The data for which the median and the MAD are to be calculated
033         * @return A two-element array of doubles, consisting of the MAD and the median of the data
034         */
035        public static double[] medianAbsoluteDeviation(Dataset data) {
036                
037                double median = (Double)Stats.median(data);
038                data = Maths.subtract(data, median);
039                data = Maths.abs(data);
040                double median2 = (Double)Stats.median(data);
041                double mad = MADSCALEFACTOR * median2;
042                
043                return new double[]{mad, median};
044        }
045        
046        /**
047         * Returns the Sn estimator of Croux and Rousseeuw.
048         * <p>
049         * This is the simple O(n²) version of the calculation algorithm.
050         * @param data
051         *                      The data for which the estimator is to be calculated.
052         * @return The value of the Sn estimator for the data
053         */
054        public static double snNaive(Dataset data) {
055                
056                Dataset medAbs = DatasetFactory.zeros(data);
057                Dataset dif = DatasetFactory.zeros(data);
058                
059                IndexIterator it = data.getIterator();
060                int count = 0;
061                while (it.hasNext()) {
062                        double val = data.getElementDoubleAbs(it.index);
063                        Maths.subtract(data, val, dif);
064                        Maths.abs(dif,dif);
065                        //Lower median - Math.floor((n/2)+1) of sorted
066                        dif.sort(null);
067                        medAbs.setObjectAbs(count++, lowMed(dif));
068                }
069                
070                
071                //Higher median - Math.floor((n+1)/2) of sorted
072                medAbs.sort(null);
073                double median = highMed(medAbs);
074                
075                return median * SNSCALEFACTOR;
076        }
077        
078        /**
079         * Returns the Sn estimator of Croux and Rousseeuw.
080         * <p>
081         * This is the complex O(nlog n) version of the calculation algorithm.
082         * @param data
083         *                      The data for which the estimator is to be calculated.
084         * @return The value of the Sn estimator for the data
085         */
086        public static double snFast(Dataset data) {
087                
088                Dataset sorted = data.clone();
089                sorted.sort(null);
090                
091                Dataset medAbs = DatasetFactory.zeros(data);
092                
093                IndexIterator it = data.getIterator();
094                int count = 0;
095                while (it.hasNext()) {
096                        MedianOfTwoSortedSets snuff = new MedianForSn(sorted, it.index);
097                        medAbs.setObjectAbs(count++, snuff.get());
098                }
099                
100                
101                //Higher median - Math.floor((n+1)/2) of sorted
102                medAbs.sort(null);
103                double median = highMed(medAbs);
104                
105                return median * SNSCALEFACTOR;
106        }
107        
108        /**
109         * Returns the lomed
110         * <p>
111         * Returns the lomed (low median) of a sorted Dataset.
112         * @param data
113         *                      A sorted Dataset for which the low median is to be calculated.  
114         * @return
115         *              The value of the lomed of the data
116         */
117        public static double lowMed(Dataset data) {
118                return data.getElementDoubleAbs((int)Math.floor((data.getSize()/2)));
119        }
120        
121        /**
122         * Returns the himed
123         * <p>
124         * Returns the himed (high median) of a sorted Dataset.
125         * @param data
126         *                      A sorted Dataset for which the low median is to be calculated.  
127         * @return
128         *              The value of the himed of the data
129         */
130        public static double highMed(Dataset data) {
131                return data.getElementDoubleAbs((int)Math.floor((data.getSize()+1)/2-1));
132        }
133
134        /**
135         * Calculates the overall median of two double arrays
136         * @param a first array
137         * @param b second array
138         * @return the overall median of the two arrays
139         */
140        public static double medianOFTwoPrimitiveArrays (double[] a, double[] b) {
141                MedianOfTwoArrays medio = new MedianOfTwoArrays(a, b);
142                return medio.get();
143        }
144}
145
146/**
147 * Allows the calculation of the median of two arrays.
148 * <p>
149 * Subclasses must implement getA() and getB() to return the elements of A or B
150 * at the given index. The length of a must be less than or equal to that of b.
151 * The constructor must set the sizes nA and nB.
152 */
153abstract class MedianOfTwoSortedSets{
154        int nB, nA, diff, diffLeft;
155        
156        public final double get() {
157                // Initialize the left and right markers for the set of candidate 
158                // values. These are inclusive on both left and right.
159                int leftA = 0, leftB = 0;
160                @SuppressWarnings("unused") // keep rightA for symmetry
161                int rightA = nB-1, rightB = nB-1;
162                
163                while (nB > 1) {
164                        // For 0-based indexing, the lomed is the element at floor((n+1)/2)-1
165                        int medianIndex = (int) Math.floor((nB+1)/2)-1;
166                        int medianAIndex = leftA + medianIndex,
167                                        medianBIndex = leftB + medianIndex;
168                        double medA = getAm(medianAIndex),
169                                        medB = getBm(medianBIndex);
170
171                        int smallerShift = 0;
172                        if (nB % 2 == 0) {
173                                // N even: the smaller lomed, as well as anything smaller than it, cannot be the overall median
174                                smallerShift = +1;
175                        }
176                        
177                        if (medA >= medB) {
178                                rightA = medianAIndex;
179                                leftB = medianBIndex + smallerShift;
180                        } else {
181                                rightB = medianBIndex;
182                                leftA = medianAIndex + smallerShift;
183                        }
184                        
185                        // Different lengths
186                        // It should be floor((l_m-1 + 1)/2))
187                        // this is newLength, defined above
188                        // Difference between left and right
189                        nB = rightB - leftB + 1;
190                }
191
192                // when the array is length 1, right and left will be the same.
193                // The lomed of a two element array is the smaller of the two
194                return Math.min(getAm(leftA), getBm(leftB));
195        }
196        
197        // Get the value in the expanded array
198        private double getAm(int i) {
199                int firstElement = diffLeft,
200                                lastElement = diffLeft + nA - 1;
201                if (i < firstElement) {
202                        return Double.NEGATIVE_INFINITY;
203                } else if (i > lastElement) {
204                        return Double.POSITIVE_INFINITY;
205                } else {
206                        return getA(i - diffLeft);
207                }
208        }
209        
210        private double getBm(int i) {
211                return getB(i);
212        }
213
214        // Get the values in the original arrays
215        protected abstract double getA(int i);  
216        protected abstract double getB(int i);
217        
218        // Call this to set up the length difference variables. 
219        protected void setDiffs() {
220                diff = nB - nA;
221                diffLeft = diff/2;
222        }
223}
224
225class MedianOfTwoArrays extends MedianOfTwoSortedSets {
226
227        double[] a, b;
228
229        public MedianOfTwoArrays(double[] ain, double[] bin) {
230                if (bin.length >= ain.length) {
231                        this.a = ain;
232                        nA = ain.length;
233                        this.b = bin;
234                        nB = bin.length;
235                } else {
236                        this.a = bin;
237                        nA = bin.length;
238                        this.b = ain;
239                        nB = ain.length;
240                }
241                setDiffs();
242        }
243        @Override
244        protected double getA(int i) {
245                return a[i];
246        }
247        @Override
248        protected double getB(int i) {
249                return b[i];
250        }
251}
252
253class MedianForSn extends MedianOfTwoSortedSets {
254        Dataset xj;
255        int referenceIndex;
256        boolean lowerIsBigger;
257
258        public MedianForSn(Dataset xj, int referenceIndex) {
259                this.xj = xj;
260                this.referenceIndex = referenceIndex;
261
262                // determine which of the two halves of the array is larger
263                int lowerSize = referenceIndex, upperSize = xj.getSize() - referenceIndex - 1;
264                lowerIsBigger = lowerSize > upperSize;
265
266                // Set the array sizes
267                if (lowerIsBigger) {
268                        nA = upperSize;
269                        nB = lowerSize;
270                } else {
271                        nA = lowerSize;
272                        nB = upperSize;
273                }
274                setDiffs();
275        }
276
277        @Override
278        protected double getA(int i) {
279                return (!lowerIsBigger) ? getLower(i) : getUpper(i);
280        }
281        @Override
282        protected double getB(int i) {
283                return (!lowerIsBigger) ? getUpper(i) : getLower(i);
284        }
285        
286        private double getLower(int i) {
287                return xj.getDouble(referenceIndex) - xj.getDouble(referenceIndex - 1 - i);
288        }
289        private double getUpper(int i) {
290                return xj.getDouble(i + referenceIndex + 1) - xj.getDouble(referenceIndex);
291        }
292}