1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.lang3; 18 19 import java.io.Serializable; 20 import java.util.Collections; 21 import java.util.HashMap; 22 import java.util.HashSet; 23 import java.util.Map; 24 import java.util.Set; 25 import java.util.stream.Stream; 26 27 /** 28 * A set of characters. 29 * 30 * <p>Instances are immutable, but instances of subclasses may not be.</p> 31 * 32 * <p>#ThreadSafe#</p> 33 * @since 1.0 34 */ 35 public class CharSet implements Serializable { 36 37 /** 38 * Required for serialization support. Lang version 2.0. 39 * 40 * @see java.io.Serializable 41 */ 42 private static final long serialVersionUID = 5947847346149275958L; 43 44 /** 45 * A CharSet defining no characters. 46 * @since 2.0 47 */ 48 public static final CharSet EMPTY = new CharSet((String) null); 49 50 /** 51 * A CharSet defining ASCII alphabetic characters "a-zA-Z". 52 * @since 2.0 53 */ 54 public static final CharSet ASCII_ALPHA = new CharSet("a-zA-Z"); 55 56 /** 57 * A CharSet defining ASCII alphabetic characters "a-z". 58 * @since 2.0 59 */ 60 public static final CharSet ASCII_ALPHA_LOWER = new CharSet("a-z"); 61 62 /** 63 * A CharSet defining ASCII alphabetic characters "A-Z". 64 * @since 2.0 65 */ 66 public static final CharSet ASCII_ALPHA_UPPER = new CharSet("A-Z"); 67 68 /** 69 * A CharSet defining ASCII alphabetic characters "0-9". 70 * @since 2.0 71 */ 72 public static final CharSet ASCII_NUMERIC = new CharSet("0-9"); 73 74 /** 75 * A Map of the common cases used in the factory. 76 * Subclasses can add more common patterns if desired 77 * @since 2.0 78 */ 79 protected static final Map<String, CharSet> COMMON = Collections.synchronizedMap(new HashMap<>()); 80 81 static { 82 COMMON.put(null, EMPTY); 83 COMMON.put(StringUtils.EMPTY, EMPTY); 84 COMMON.put("a-zA-Z", ASCII_ALPHA); 85 COMMON.put("A-Za-z", ASCII_ALPHA); 86 COMMON.put("a-z", ASCII_ALPHA_LOWER); 87 COMMON.put("A-Z", ASCII_ALPHA_UPPER); 88 COMMON.put("0-9", ASCII_NUMERIC); 89 } 90 91 /** 92 * Factory method to create a new CharSet using a special syntax. 93 * 94 * <ul> 95 * <li>{@code null} or empty string ("") 96 * - set containing no characters</li> 97 * <li>Single character, such as "a" 98 * - set containing just that character</li> 99 * <li>Multi character, such as "a-e" 100 * - set containing characters from one character to the other</li> 101 * <li>Negated, such as "^a" or "^a-e" 102 * - set containing all characters except those defined</li> 103 * <li>Combinations, such as "abe-g" 104 * - set containing all the characters from the individual sets</li> 105 * </ul> 106 * 107 * <p>The matching order is:</p> 108 * <ol> 109 * <li>Negated multi character range, such as "^a-e" 110 * <li>Ordinary multi character range, such as "a-e" 111 * <li>Negated single character, such as "^a" 112 * <li>Ordinary single character, such as "a" 113 * </ol> 114 * 115 * <p>Matching works left to right. Once a match is found the 116 * search starts again from the next character.</p> 117 * 118 * <p>If the same range is defined twice using the same syntax, only 119 * one range will be kept. 120 * Thus, "a-ca-c" creates only one range of "a-c".</p> 121 * 122 * <p>If the start and end of a range are in the wrong order, 123 * they are reversed. Thus "a-e" is the same as "e-a". 124 * As a result, "a-ee-a" would create only one range, 125 * as the "a-e" and "e-a" are the same.</p> 126 * 127 * <p>The set of characters represented is the union of the specified ranges.</p> 128 * 129 * <p>There are two ways to add a literal negation character ({@code ^}):</p> 130 * <ul> 131 * <li>As the last character in a string, e.g. {@code CharSet.getInstance("a-z^")}</li> 132 * <li>As a separate element, e.g. {@code CharSet.getInstance("^", "a-z")}</li> 133 * </ul> 134 * 135 * <p>Examples using the negation character:</p> 136 * <pre> 137 * CharSet.getInstance("^a-c").contains('a') = false 138 * CharSet.getInstance("^a-c").contains('d') = true 139 * CharSet.getInstance("^^a-c").contains('a') = true // (only '^' is negated) 140 * CharSet.getInstance("^^a-c").contains('^') = false 141 * CharSet.getInstance("^a-cd-f").contains('d') = true 142 * CharSet.getInstance("a-c^").contains('^') = true 143 * CharSet.getInstance("^", "a-c").contains('^') = true 144 * </pre> 145 * 146 * <p>All CharSet objects returned by this method will be immutable.</p> 147 * 148 * @param setStrs Strings to merge into the set, may be null 149 * @return a CharSet instance 150 * @since 2.4 151 */ 152 public static CharSet getInstance(final String... setStrs) { 153 if (setStrs == null) { 154 return null; 155 } 156 if (setStrs.length == 1) { 157 final CharSet common = COMMON.get(setStrs[0]); 158 if (common != null) { 159 return common; 160 } 161 } 162 return new CharSet(setStrs); 163 } 164 165 /** The set of CharRange objects. */ 166 private final Set<CharRange> set = Collections.synchronizedSet(new HashSet<>()); 167 168 /** 169 * Constructs a new CharSet using the set syntax. 170 * Each string is merged in with the set. 171 * 172 * @param set Strings to merge into the initial set 173 * @throws NullPointerException if set is {@code null} 174 */ 175 protected CharSet(final String... set) { 176 Stream.of(set).forEach(this::add); 177 } 178 179 /** 180 * Add a set definition string to the {@link CharSet}. 181 * 182 * @param str set definition string 183 */ 184 protected void add(final String str) { 185 if (str == null) { 186 return; 187 } 188 189 final int len = str.length(); 190 int pos = 0; 191 while (pos < len) { 192 final int remainder = len - pos; 193 if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') { 194 // negated range 195 set.add(CharRange.isNotIn(str.charAt(pos + 1), str.charAt(pos + 3))); 196 pos += 4; 197 } else if (remainder >= 3 && str.charAt(pos + 1) == '-') { 198 // range 199 set.add(CharRange.isIn(str.charAt(pos), str.charAt(pos + 2))); 200 pos += 3; 201 } else if (remainder >= 2 && str.charAt(pos) == '^') { 202 // negated char 203 set.add(CharRange.isNot(str.charAt(pos + 1))); 204 pos += 2; 205 } else { 206 // char 207 set.add(CharRange.is(str.charAt(pos))); 208 pos += 1; 209 } 210 } 211 } 212 213 /** 214 * Does the {@link CharSet} contain the specified 215 * character {@code ch}. 216 * 217 * @param ch the character to check for 218 * @return {@code true} if the set contains the characters 219 */ 220 public boolean contains(final char ch) { 221 synchronized (set) { 222 return set.stream().anyMatch(range -> range.contains(ch)); 223 } 224 } 225 226 // Basics 227 /** 228 * Compares two {@link CharSet} objects, returning true if they represent 229 * exactly the same set of characters defined in the same way. 230 * 231 * <p>The two sets {@code abc} and {@code a-c} are <em>not</em> 232 * equal according to this method.</p> 233 * 234 * @param obj the object to compare to 235 * @return true if equal 236 * @since 2.0 237 */ 238 @Override 239 public boolean equals(final Object obj) { 240 if (obj == this) { 241 return true; 242 } 243 if (!(obj instanceof CharSet)) { 244 return false; 245 } 246 final CharSet other = (CharSet) obj; 247 return set.equals(other.set); 248 } 249 250 /** 251 * Gets the internal set as an array of CharRange objects. 252 * 253 * @return an array of immutable CharRange objects 254 * @since 2.0 255 */ 256 // NOTE: This is no longer public as CharRange is no longer a public class. 257 // It may be replaced when CharSet moves to Range. 258 /*public*/ CharRange[] getCharRanges() { 259 return set.toArray(CharRange.EMPTY_ARRAY); 260 } 261 262 /** 263 * Gets a hash code compatible with the equals method. 264 * 265 * @return a suitable hash code 266 * @since 2.0 267 */ 268 @Override 269 public int hashCode() { 270 return 89 + set.hashCode(); 271 } 272 273 /** 274 * Gets a string representation of the set. 275 * 276 * @return string representation of the set 277 */ 278 @Override 279 public String toString() { 280 return set.toString(); 281 } 282 283 }