View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.lang3;
18  
19  import java.io.Serializable;
20  import java.util.Collections;
21  import java.util.HashMap;
22  import java.util.HashSet;
23  import java.util.Map;
24  import java.util.Set;
25  import java.util.stream.Stream;
26  
27  /**
28   * A set of characters.
29   *
30   * <p>Instances are immutable, but instances of subclasses may not be.</p>
31   *
32   * <p>#ThreadSafe#</p>
33   * @since 1.0
34   */
35  public class CharSet implements Serializable {
36  
37      /**
38       * Required for serialization support. Lang version 2.0.
39       *
40       * @see java.io.Serializable
41       */
42      private static final long serialVersionUID = 5947847346149275958L;
43  
44      /**
45       * A CharSet defining no characters.
46       * @since 2.0
47       */
48      public static final CharSet EMPTY = new CharSet((String) null);
49  
50      /**
51       * A CharSet defining ASCII alphabetic characters "a-zA-Z".
52       * @since 2.0
53       */
54      public static final CharSet ASCII_ALPHA = new CharSet("a-zA-Z");
55  
56      /**
57       * A CharSet defining ASCII alphabetic characters "a-z".
58       * @since 2.0
59       */
60      public static final CharSet ASCII_ALPHA_LOWER = new CharSet("a-z");
61  
62      /**
63       * A CharSet defining ASCII alphabetic characters "A-Z".
64       * @since 2.0
65       */
66      public static final CharSet ASCII_ALPHA_UPPER = new CharSet("A-Z");
67  
68      /**
69       * A CharSet defining ASCII alphabetic characters "0-9".
70       * @since 2.0
71       */
72      public static final CharSet ASCII_NUMERIC = new CharSet("0-9");
73  
74      /**
75       * A Map of the common cases used in the factory.
76       * Subclasses can add more common patterns if desired
77       * @since 2.0
78       */
79      protected static final Map<String, CharSet> COMMON = Collections.synchronizedMap(new HashMap<>());
80  
81      static {
82          COMMON.put(null, EMPTY);
83          COMMON.put(StringUtils.EMPTY, EMPTY);
84          COMMON.put("a-zA-Z", ASCII_ALPHA);
85          COMMON.put("A-Za-z", ASCII_ALPHA);
86          COMMON.put("a-z", ASCII_ALPHA_LOWER);
87          COMMON.put("A-Z", ASCII_ALPHA_UPPER);
88          COMMON.put("0-9", ASCII_NUMERIC);
89      }
90  
91      /**
92       * Factory method to create a new CharSet using a special syntax.
93       *
94       * <ul>
95       *  <li>{@code null} or empty string ("")
96       * - set containing no characters</li>
97       *  <li>Single character, such as "a"
98       *  - set containing just that character</li>
99       *  <li>Multi character, such as "a-e"
100      *  - set containing characters from one character to the other</li>
101      *  <li>Negated, such as "^a" or "^a-e"
102      *  - set containing all characters except those defined</li>
103      *  <li>Combinations, such as "abe-g"
104      *  - set containing all the characters from the individual sets</li>
105      * </ul>
106      *
107      * <p>The matching order is:</p>
108      * <ol>
109      *  <li>Negated multi character range, such as "^a-e"
110      *  <li>Ordinary multi character range, such as "a-e"
111      *  <li>Negated single character, such as "^a"
112      *  <li>Ordinary single character, such as "a"
113      * </ol>
114      *
115      * <p>Matching works left to right. Once a match is found the
116      * search starts again from the next character.</p>
117      *
118      * <p>If the same range is defined twice using the same syntax, only
119      * one range will be kept.
120      * Thus, "a-ca-c" creates only one range of "a-c".</p>
121      *
122      * <p>If the start and end of a range are in the wrong order,
123      * they are reversed. Thus "a-e" is the same as "e-a".
124      * As a result, "a-ee-a" would create only one range,
125      * as the "a-e" and "e-a" are the same.</p>
126      *
127      * <p>The set of characters represented is the union of the specified ranges.</p>
128      *
129      * <p>There are two ways to add a literal negation character ({@code ^}):</p>
130      * <ul>
131      *     <li>As the last character in a string, e.g. {@code CharSet.getInstance("a-z^")}</li>
132      *     <li>As a separate element, e.g. {@code CharSet.getInstance("^", "a-z")}</li>
133      * </ul>
134      *
135      * <p>Examples using the negation character:</p>
136      * <pre>
137      *     CharSet.getInstance("^a-c").contains('a') = false
138      *     CharSet.getInstance("^a-c").contains('d') = true
139      *     CharSet.getInstance("^^a-c").contains('a') = true // (only '^' is negated)
140      *     CharSet.getInstance("^^a-c").contains('^') = false
141      *     CharSet.getInstance("^a-cd-f").contains('d') = true
142      *     CharSet.getInstance("a-c^").contains('^') = true
143      *     CharSet.getInstance("^", "a-c").contains('^') = true
144      * </pre>
145      *
146      * <p>All CharSet objects returned by this method will be immutable.</p>
147      *
148      * @param setStrs  Strings to merge into the set, may be null
149      * @return a CharSet instance
150      * @since 2.4
151      */
152     public static CharSet getInstance(final String... setStrs) {
153         if (setStrs == null) {
154             return null;
155         }
156         if (setStrs.length == 1) {
157             final CharSet common = COMMON.get(setStrs[0]);
158             if (common != null) {
159                 return common;
160             }
161         }
162         return new CharSet(setStrs);
163     }
164 
165     /** The set of CharRange objects. */
166     private final Set<CharRange> set = Collections.synchronizedSet(new HashSet<>());
167 
168     /**
169      * Constructs a new CharSet using the set syntax.
170      * Each string is merged in with the set.
171      *
172      * @param set  Strings to merge into the initial set
173      * @throws NullPointerException if set is {@code null}
174      */
175     protected CharSet(final String... set) {
176         Stream.of(set).forEach(this::add);
177     }
178 
179     /**
180      * Add a set definition string to the {@link CharSet}.
181      *
182      * @param str  set definition string
183      */
184     protected void add(final String str) {
185         if (str == null) {
186             return;
187         }
188 
189         final int len = str.length();
190         int pos = 0;
191         while (pos < len) {
192             final int remainder = len - pos;
193             if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') {
194                 // negated range
195                 set.add(CharRange.isNotIn(str.charAt(pos + 1), str.charAt(pos + 3)));
196                 pos += 4;
197             } else if (remainder >= 3 && str.charAt(pos + 1) == '-') {
198                 // range
199                 set.add(CharRange.isIn(str.charAt(pos), str.charAt(pos + 2)));
200                 pos += 3;
201             } else if (remainder >= 2 && str.charAt(pos) == '^') {
202                 // negated char
203                 set.add(CharRange.isNot(str.charAt(pos + 1)));
204                 pos += 2;
205             } else {
206                 // char
207                 set.add(CharRange.is(str.charAt(pos)));
208                 pos += 1;
209             }
210         }
211     }
212 
213     /**
214      * Does the {@link CharSet} contain the specified
215      * character {@code ch}.
216      *
217      * @param ch  the character to check for
218      * @return {@code true} if the set contains the characters
219      */
220     public boolean contains(final char ch) {
221         synchronized (set) {
222             return set.stream().anyMatch(range -> range.contains(ch));
223         }
224     }
225 
226     // Basics
227     /**
228      * Compares two {@link CharSet} objects, returning true if they represent
229      * exactly the same set of characters defined in the same way.
230      *
231      * <p>The two sets {@code abc} and {@code a-c} are <em>not</em>
232      * equal according to this method.</p>
233      *
234      * @param obj  the object to compare to
235      * @return true if equal
236      * @since 2.0
237      */
238     @Override
239     public boolean equals(final Object obj) {
240         if (obj == this) {
241             return true;
242         }
243         if (!(obj instanceof CharSet)) {
244             return false;
245         }
246         final CharSet other = (CharSet) obj;
247         return set.equals(other.set);
248     }
249 
250     /**
251      * Gets the internal set as an array of CharRange objects.
252      *
253      * @return an array of immutable CharRange objects
254      * @since 2.0
255      */
256 // NOTE: This is no longer public as CharRange is no longer a public class.
257 //       It may be replaced when CharSet moves to Range.
258     /*public*/ CharRange[] getCharRanges() {
259         return set.toArray(CharRange.EMPTY_ARRAY);
260     }
261 
262     /**
263      * Gets a hash code compatible with the equals method.
264      *
265      * @return a suitable hash code
266      * @since 2.0
267      */
268     @Override
269     public int hashCode() {
270         return 89 + set.hashCode();
271     }
272 
273     /**
274      * Gets a string representation of the set.
275      *
276      * @return string representation of the set
277      */
278     @Override
279     public String toString() {
280         return set.toString();
281     }
282 
283 }