View Javadoc
1   /*
2    * Copyright (C) 2013 Alberto Irurueta Carro (alberto@irurueta.com)
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *         http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  package com.irurueta.numerical.robust;
17  
18  import com.irurueta.statistics.UniformRandomizer;
19  
20  import java.util.HashSet;
21  import java.util.Set;
22  
23  /**
24   * This class computes indices of subsets of samples using a uniform randomizer
25   * to pick random samples as fast as possible.
26   * This class ensures that samples are not repeated within a single subset.
27   */
28  public class FastRandomSubsetSelector extends SubsetSelector {
29  
30      /**
31       * Constant defining whether randomizer needs to be initialized with system
32       * timer. By disabling this option it is ensured that the same result is
33       * obtained on each execution.
34       */
35      public static final boolean DEFAULT_SEED_RANDOMIZER_WITH_TIME = false;
36  
37      /**
38       * Randomizer to pick random indexes.
39       */
40      private UniformRandomizer randomizer;
41  
42      /**
43       * Set containing selected indices on a given run.
44       * This is kept around between executions to avoid excessive calls to
45       * garbage collector, as random subset generation is likely to be called
46       * a large number of times during robust estimations. Because of that,
47       * if a robust estimator is capable of spanning multiple executions at once
48       * on different threads, then each thread needs to have a different subset
49       * selector instance.
50       */
51      private final Set<Integer> selectedIndices;
52  
53      /**
54       * Constructor.
55       *
56       * @param numSamples number of samples to select subsets from
57       * @throws IllegalArgumentException if provided number of samples is zero
58       *                                  or negative
59       */
60      public FastRandomSubsetSelector(final int numSamples) {
61          this(numSamples, DEFAULT_SEED_RANDOMIZER_WITH_TIME);
62      }
63  
64      /**
65       * Constructor.
66       *
67       * @param numSamples             number of samples to select subsets from.
68       * @param seedRandomizerWithTime true if randomizer seed must be initialized
69       *                               to system timer to obtain more random results.
70       * @throws IllegalArgumentException if provided number of samples is zero
71       *                                  or negative.
72       */
73      public FastRandomSubsetSelector(final int numSamples, final boolean seedRandomizerWithTime) {
74          super(numSamples);
75          createRandomizer(seedRandomizerWithTime);
76          selectedIndices = new HashSet<>();
77      }
78  
79      /**
80       * Returns type of this subset selector.
81       *
82       * @return type of this subset selector.
83       */
84      @Override
85      public SubsetSelectorType getType() {
86          return SubsetSelectorType.FAST_RANDOM_SUBSET_SELECTOR;
87      }
88  
89      /**
90       * Computes a random subset of indices within range of number of samples to
91       * be used on robust estimators.
92       *
93       * @param subsetSize subset size to be computed. This value must be smaller
94       *                   than total number of samples.
95       * @param result     array containing indices to be picked. Provided array must
96       *                   be at least of length subsetSize. The former subsetSize entries of the
97       *                   array will be modified by this method.
98       * @throws NotEnoughSamplesException  if subset size is greater than the
99       *                                    total number of samples.
100      * @throws InvalidSubsetSizeException if subset size is zero or if result
101      *                                    array does not have at least a length of subsetSize.
102      */
103     @Override
104     public void computeRandomSubsets(final int subsetSize, final int[] result) throws NotEnoughSamplesException,
105             InvalidSubsetSizeException {
106         if (subsetSize == 0) {
107             throw new InvalidSubsetSizeException();
108         }
109         if (result.length < subsetSize) {
110             throw new InvalidSubsetSizeException();
111         }
112         if (numSamples < subsetSize) {
113             throw new NotEnoughSamplesException();
114         }
115 
116         // On start set of selected indices is empty
117         selectedIndices.clear();
118 
119         var counter = 0;
120         int index;
121         do {
122             index = randomizer.nextInt(0, numSamples);
123 
124             // check whether this index has already been selected
125             if (selectedIndices.contains(index)) {
126                 continue;
127             }
128 
129             // if not selected, pick it now
130             selectedIndices.add(index);
131             result[counter] = index;
132             counter++;
133         } while (counter < subsetSize);
134 
135         selectedIndices.clear();
136     }
137 
138     /**
139      * Computes a random subset of indices within provided range of positions to
140      * be used on robust estimators.
141      *
142      * @param minPos     minimum position to be picked. This value must be greater
143      *                   or equal than zero and smaller than the total number of samples and less
144      *                   than maxPos.
145      * @param maxPos     maximum position to be picked. This value must be greater
146      *                   or equal than zero and smaller than the total number of samples and
147      *                   greater than minPos.
148      * @param subsetSize subset size to be computed. This value must be smaller
149      *                   than total number of samples.
150      * @param pickLast   true indicates that last sample in range must always be
151      *                   picked within subset. This is done to obtain faster execution times and
152      *                   greater stability on some algorithms.
153      * @param result     array containing indices to be picked. Provided array must
154      *                   be at least of length subsetSize. The former subsetSize entries of the
155      *                   array will be modified by this method.
156      * @throws NotEnoughSamplesException   if subset size is greater than the
157      *                                     total number of samples or if maxPos is greater than the total number of
158      *                                     samples.
159      * @throws InvalidSubsetSizeException  if subset size is zero or if result
160      *                                     array does not have at least a length of subsetSize, or ig subset size
161      *                                     is greater than the allowed range of positions to be picked.
162      * @throws InvalidSubsetRangeException if maximum position is smaller than
163      *                                     minimum position or maximum or minimum position are negative.
164      */
165     @Override
166     public void computeRandomSubsetsInRange(
167             final int minPos, final int maxPos, final int subsetSize, final boolean pickLast, final int[] result)
168             throws NotEnoughSamplesException, InvalidSubsetSizeException, InvalidSubsetRangeException {
169         if (subsetSize == 0) {
170             throw new InvalidSubsetSizeException();
171         }
172         if (result.length < subsetSize) {
173             throw new InvalidSubsetSizeException();
174         }
175         if (minPos >= maxPos || maxPos < 0 || minPos < 0) {
176             throw new InvalidSubsetRangeException();
177         }
178         if ((maxPos - minPos) < subsetSize) {
179             throw new InvalidSubsetSizeException();
180         }
181         if (numSamples < subsetSize) {
182             throw new NotEnoughSamplesException();
183         }
184         if (maxPos > numSamples) {
185             throw new NotEnoughSamplesException();
186         }
187 
188         // On start set of selected indices is empty
189         selectedIndices.clear();
190 
191         var counter = 0;
192         int index;
193 
194         if (pickLast) {
195             // this is done to accelerate computations and obtain more
196             // stable results in some cases
197             // pick last element in range
198             index = maxPos - 1;
199             selectedIndices.add(index);
200             result[counter] = index;
201             counter++;
202         }
203 
204         if (!pickLast || counter < result.length) {
205             // keep selecting only if not all elements have already been selected
206             do {
207                 index = randomizer.nextInt(minPos, maxPos);
208 
209                 // check whether this index has already been selected
210                 if (selectedIndices.contains(index)) {
211                     continue;
212                 }
213 
214                 // if not selected, pick it now
215                 selectedIndices.add(index);
216                 result[counter] = index;
217                 counter++;
218             } while (counter < subsetSize);
219         }
220 
221         selectedIndices.clear();
222     }
223 
224     /**
225      * Returns internal randomizer to generate uniformly distributed random
226      * values.
227      *
228      * @return internal randomizer.
229      */
230     protected UniformRandomizer getRandomizer() {
231         return randomizer;
232     }
233 
234     /**
235      * Initializes randomizer for an instance of this class.
236      *
237      * @param seedWithTime if true randomizer will be initialized using current
238      *                     time as seed. If false, randomizer will always generate the same
239      *                     pseudo-random sequence on each JVM execution.
240      */
241     private void createRandomizer(final boolean seedWithTime) {
242         randomizer = new UniformRandomizer();
243         if (seedWithTime) {
244             randomizer.setSeed(System.currentTimeMillis());
245         }
246     }
247 }