001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.lang3;
018
019import java.io.Serializable;
020import java.util.Collections;
021import java.util.HashMap;
022import java.util.HashSet;
023import java.util.Map;
024import java.util.Set;
025import java.util.stream.Stream;
026
027/**
028 * A set of characters.
029 *
030 * <p>Instances are immutable, but instances of subclasses may not be.</p>
031 *
032 * <p>#ThreadSafe#</p>
033 * @since 1.0
034 */
035public class CharSet implements Serializable {
036
037    /**
038     * Required for serialization support. Lang version 2.0.
039     *
040     * @see java.io.Serializable
041     */
042    private static final long serialVersionUID = 5947847346149275958L;
043
044    /**
045     * A CharSet defining no characters.
046     * @since 2.0
047     */
048    public static final CharSet EMPTY = new CharSet((String) null);
049
050    /**
051     * A CharSet defining ASCII alphabetic characters "a-zA-Z".
052     * @since 2.0
053     */
054    public static final CharSet ASCII_ALPHA = new CharSet("a-zA-Z");
055
056    /**
057     * A CharSet defining ASCII alphabetic characters "a-z".
058     * @since 2.0
059     */
060    public static final CharSet ASCII_ALPHA_LOWER = new CharSet("a-z");
061
062    /**
063     * A CharSet defining ASCII alphabetic characters "A-Z".
064     * @since 2.0
065     */
066    public static final CharSet ASCII_ALPHA_UPPER = new CharSet("A-Z");
067
068    /**
069     * A CharSet defining ASCII alphabetic characters "0-9".
070     * @since 2.0
071     */
072    public static final CharSet ASCII_NUMERIC = new CharSet("0-9");
073
074    /**
075     * A Map of the common cases used in the factory.
076     * Subclasses can add more common patterns if desired
077     * @since 2.0
078     */
079    protected static final Map<String, CharSet> COMMON = Collections.synchronizedMap(new HashMap<>());
080
081    static {
082        COMMON.put(null, EMPTY);
083        COMMON.put(StringUtils.EMPTY, EMPTY);
084        COMMON.put("a-zA-Z", ASCII_ALPHA);
085        COMMON.put("A-Za-z", ASCII_ALPHA);
086        COMMON.put("a-z", ASCII_ALPHA_LOWER);
087        COMMON.put("A-Z", ASCII_ALPHA_UPPER);
088        COMMON.put("0-9", ASCII_NUMERIC);
089    }
090
091    /**
092     * Factory method to create a new CharSet using a special syntax.
093     *
094     * <ul>
095     *  <li>{@code null} or empty string ("")
096     * - set containing no characters</li>
097     *  <li>Single character, such as "a"
098     *  - set containing just that character</li>
099     *  <li>Multi character, such as "a-e"
100     *  - set containing characters from one character to the other</li>
101     *  <li>Negated, such as "^a" or "^a-e"
102     *  - set containing all characters except those defined</li>
103     *  <li>Combinations, such as "abe-g"
104     *  - set containing all the characters from the individual sets</li>
105     * </ul>
106     *
107     * <p>The matching order is:</p>
108     * <ol>
109     *  <li>Negated multi character range, such as "^a-e"
110     *  <li>Ordinary multi character range, such as "a-e"
111     *  <li>Negated single character, such as "^a"
112     *  <li>Ordinary single character, such as "a"
113     * </ol>
114     *
115     * <p>Matching works left to right. Once a match is found the
116     * search starts again from the next character.</p>
117     *
118     * <p>If the same range is defined twice using the same syntax, only
119     * one range will be kept.
120     * Thus, "a-ca-c" creates only one range of "a-c".</p>
121     *
122     * <p>If the start and end of a range are in the wrong order,
123     * they are reversed. Thus "a-e" is the same as "e-a".
124     * As a result, "a-ee-a" would create only one range,
125     * as the "a-e" and "e-a" are the same.</p>
126     *
127     * <p>The set of characters represented is the union of the specified ranges.</p>
128     *
129     * <p>There are two ways to add a literal negation character ({@code ^}):</p>
130     * <ul>
131     *     <li>As the last character in a string, e.g. {@code CharSet.getInstance("a-z^")}</li>
132     *     <li>As a separate element, e.g. {@code CharSet.getInstance("^", "a-z")}</li>
133     * </ul>
134     *
135     * <p>Examples using the negation character:</p>
136     * <pre>
137     *     CharSet.getInstance("^a-c").contains('a') = false
138     *     CharSet.getInstance("^a-c").contains('d') = true
139     *     CharSet.getInstance("^^a-c").contains('a') = true // (only '^' is negated)
140     *     CharSet.getInstance("^^a-c").contains('^') = false
141     *     CharSet.getInstance("^a-cd-f").contains('d') = true
142     *     CharSet.getInstance("a-c^").contains('^') = true
143     *     CharSet.getInstance("^", "a-c").contains('^') = true
144     * </pre>
145     *
146     * <p>All CharSet objects returned by this method will be immutable.</p>
147     *
148     * @param setStrs  Strings to merge into the set, may be null
149     * @return a CharSet instance
150     * @since 2.4
151     */
152    public static CharSet getInstance(final String... setStrs) {
153        if (setStrs == null) {
154            return null;
155        }
156        if (setStrs.length == 1) {
157            final CharSet common = COMMON.get(setStrs[0]);
158            if (common != null) {
159                return common;
160            }
161        }
162        return new CharSet(setStrs);
163    }
164
165    /** The set of CharRange objects. */
166    private final Set<CharRange> set = Collections.synchronizedSet(new HashSet<>());
167
168    /**
169     * Constructs a new CharSet using the set syntax.
170     * Each string is merged in with the set.
171     *
172     * @param set  Strings to merge into the initial set
173     * @throws NullPointerException if set is {@code null}
174     */
175    protected CharSet(final String... set) {
176        Stream.of(set).forEach(this::add);
177    }
178
179    /**
180     * Add a set definition string to the {@link CharSet}.
181     *
182     * @param str  set definition string
183     */
184    protected void add(final String str) {
185        if (str == null) {
186            return;
187        }
188
189        final int len = str.length();
190        int pos = 0;
191        while (pos < len) {
192            final int remainder = len - pos;
193            if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') {
194                // negated range
195                set.add(CharRange.isNotIn(str.charAt(pos + 1), str.charAt(pos + 3)));
196                pos += 4;
197            } else if (remainder >= 3 && str.charAt(pos + 1) == '-') {
198                // range
199                set.add(CharRange.isIn(str.charAt(pos), str.charAt(pos + 2)));
200                pos += 3;
201            } else if (remainder >= 2 && str.charAt(pos) == '^') {
202                // negated char
203                set.add(CharRange.isNot(str.charAt(pos + 1)));
204                pos += 2;
205            } else {
206                // char
207                set.add(CharRange.is(str.charAt(pos)));
208                pos += 1;
209            }
210        }
211    }
212
213    /**
214     * Does the {@link CharSet} contain the specified
215     * character {@code ch}.
216     *
217     * @param ch  the character to check for
218     * @return {@code true} if the set contains the characters
219     */
220    public boolean contains(final char ch) {
221        synchronized (set) {
222            return set.stream().anyMatch(range -> range.contains(ch));
223        }
224    }
225
226    // Basics
227    /**
228     * Compares two {@link CharSet} objects, returning true if they represent
229     * exactly the same set of characters defined in the same way.
230     *
231     * <p>The two sets {@code abc} and {@code a-c} are <em>not</em>
232     * equal according to this method.</p>
233     *
234     * @param obj  the object to compare to
235     * @return true if equal
236     * @since 2.0
237     */
238    @Override
239    public boolean equals(final Object obj) {
240        if (obj == this) {
241            return true;
242        }
243        if (!(obj instanceof CharSet)) {
244            return false;
245        }
246        final CharSet other = (CharSet) obj;
247        return set.equals(other.set);
248    }
249
250    /**
251     * Gets the internal set as an array of CharRange objects.
252     *
253     * @return an array of immutable CharRange objects
254     * @since 2.0
255     */
256// NOTE: This is no longer public as CharRange is no longer a public class.
257//       It may be replaced when CharSet moves to Range.
258    /*public*/ CharRange[] getCharRanges() {
259        return set.toArray(CharRange.EMPTY_ARRAY);
260    }
261
262    /**
263     * Gets a hash code compatible with the equals method.
264     *
265     * @return a suitable hash code
266     * @since 2.0
267     */
268    @Override
269    public int hashCode() {
270        return 89 + set.hashCode();
271    }
272
273    /**
274     * Gets a string representation of the set.
275     *
276     * @return string representation of the set
277     */
278    @Override
279    public String toString() {
280        return set.toString();
281    }
282
283}