001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.text; 018 019import java.io.UnsupportedEncodingException; 020import java.util.Arrays; 021import java.util.Collection; 022import java.util.Collections; 023import java.util.HashMap; 024import java.util.Iterator; 025import java.util.LinkedHashMap; 026import java.util.LinkedHashSet; 027import java.util.Map; 028import java.util.Map.Entry; 029import java.util.Objects; 030import java.util.Set; 031 032import org.apache.commons.lang3.ArrayUtils; 033import org.apache.commons.lang3.StringUtils; 034 035/** 036 * <p> 037 * Convert from one alphabet to another, with the possibility of leaving certain 038 * characters unencoded. 039 * </p> 040 * 041 * <p> 042 * The target and 'do not encode' languages must be in the Unicode BMP, but the 043 * source language does not. 044 * </p> 045 * 046 * <p> 047 * The encoding will all be of a fixed length, except for the 'do not encode' 048 * chars, which will be of length 1 049 * </p> 050 * 051 * <h2>Sample usage</h2> 052 * 053 * <pre> 054 * Character[] originals; // a, b, c, d 055 * Character[] encoding; // 0, 1, d 056 * Character[] doNotEncode; // d 057 * 058 * AlphabetConverter ac = AlphabetConverter.createConverterFromChars(originals, 059 * encoding, doNotEncode); 060 * 061 * ac.encode("a"); // 00 062 * ac.encode("b"); // 01 063 * ac.encode("c"); // 0d 064 * ac.encode("d"); // d 065 * ac.encode("abcd"); // 00010dd 066 * </pre> 067 * 068 * <p> 069 * #ThreadSafe# AlphabetConverter class methods are thread-safe as they do not 070 * change internal state. 071 * </p> 072 * 073 * @since 1.0 074 */ 075public final class AlphabetConverter { 076 077 /** 078 * Arrow constant, used for converting the object into a string. 079 */ 080 private static final String ARROW = " -> "; 081 082 /** 083 * Creates new String that contains just the given code point. 084 * 085 * @param i code point 086 * @return a new string with the new code point 087 * @see "http://www.oracle.com/us/technologies/java/supplementary-142654.html" 088 */ 089 private static String codePointToString(final int i) { 090 if (Character.charCount(i) == 1) { 091 return String.valueOf((char) i); 092 } 093 return new String(Character.toChars(i)); 094 } 095 096 /** 097 * Converts characters to integers. 098 * 099 * @param chars array of characters 100 * @return an equivalent array of integers 101 */ 102 private static Integer[] convertCharsToIntegers(final Character[] chars) { 103 if (ArrayUtils.isEmpty(chars)) { 104 return ArrayUtils.EMPTY_INTEGER_OBJECT_ARRAY; 105 } 106 final Integer[] integers = new Integer[chars.length]; 107 Arrays.setAll(integers, i -> (int) chars[i]); 108 return integers; 109 } 110 111 /** 112 * Creates an alphabet converter, for converting from the original alphabet, 113 * to the encoded alphabet, while leaving 114 * the characters in <em>doNotEncode</em> as they are (if possible). 115 * 116 * <p>Duplicate letters in either original or encoding will be ignored.</p> 117 * 118 * @param original an array of ints representing the original alphabet in 119 * code points 120 * @param encoding an array of ints representing the alphabet to be used for 121 * encoding, in code points 122 * @param doNotEncode an array of ints representing the chars to be encoded 123 * using the original alphabet - every char 124 * here must appear in both the previous params 125 * @return The AlphabetConverter 126 * @throws IllegalArgumentException if an AlphabetConverter cannot be 127 * constructed 128 */ 129 public static AlphabetConverter createConverter( 130 final Integer[] original, 131 final Integer[] encoding, 132 final Integer[] doNotEncode) { 133 final Set<Integer> originalCopy = new LinkedHashSet<>(Arrays.asList(original)); 134 final Set<Integer> encodingCopy = new LinkedHashSet<>(Arrays.asList(encoding)); 135 final Set<Integer> doNotEncodeCopy = new LinkedHashSet<>(Arrays.asList(doNotEncode)); 136 137 final Map<Integer, String> originalToEncoded = new LinkedHashMap<>(); 138 final Map<String, String> encodedToOriginal = new LinkedHashMap<>(); 139 final Map<Integer, String> doNotEncodeMap = new HashMap<>(); 140 141 final int encodedLetterLength; 142 143 for (final int i : doNotEncodeCopy) { 144 if (!originalCopy.contains(i)) { 145 throw new IllegalArgumentException( 146 "Can not use 'do not encode' list because original " 147 + "alphabet does not contain '" 148 + codePointToString(i) + "'"); 149 } 150 151 if (!encodingCopy.contains(i)) { 152 throw new IllegalArgumentException( 153 "Can not use 'do not encode' list because encoding alphabet does not contain '" 154 + codePointToString(i) + "'"); 155 } 156 157 doNotEncodeMap.put(i, codePointToString(i)); 158 } 159 160 if (encodingCopy.size() >= originalCopy.size()) { 161 encodedLetterLength = 1; 162 163 final Iterator<Integer> it = encodingCopy.iterator(); 164 165 for (final int originalLetter : originalCopy) { 166 final String originalLetterAsString = codePointToString(originalLetter); 167 168 if (doNotEncodeMap.containsKey(originalLetter)) { 169 originalToEncoded.put(originalLetter, originalLetterAsString); 170 encodedToOriginal.put(originalLetterAsString, originalLetterAsString); 171 } else { 172 Integer next = it.next(); 173 174 while (doNotEncodeCopy.contains(next)) { 175 next = it.next(); 176 } 177 178 final String encodedLetter = codePointToString(next); 179 180 originalToEncoded.put(originalLetter, encodedLetter); 181 encodedToOriginal.put(encodedLetter, originalLetterAsString); 182 } 183 } 184 185 return new AlphabetConverter(originalToEncoded, encodedToOriginal, encodedLetterLength); 186 187 } 188 if (encodingCopy.size() - doNotEncodeCopy.size() < 2) { 189 throw new IllegalArgumentException( 190 "Must have at least two encoding characters (excluding " 191 + "those in the 'do not encode' list), but has " 192 + (encodingCopy.size() - doNotEncodeCopy.size())); 193 } 194 // we start with one which is our minimum, and because we do the 195 // first division outside the loop 196 int lettersSoFar = 1; 197 198 // the first division takes into account that the doNotEncode 199 // letters can't be in the leftmost place 200 int lettersLeft = (originalCopy.size() - doNotEncodeCopy.size()) 201 / (encodingCopy.size() - doNotEncodeCopy.size()); 202 203 while (lettersLeft / encodingCopy.size() >= 1) { 204 lettersLeft /= encodingCopy.size(); 205 lettersSoFar++; 206 } 207 208 encodedLetterLength = lettersSoFar + 1; 209 210 final AlphabetConverter ac = 211 new AlphabetConverter(originalToEncoded, 212 encodedToOriginal, 213 encodedLetterLength); 214 215 ac.addSingleEncoding(encodedLetterLength, 216 StringUtils.EMPTY, 217 encodingCopy, 218 originalCopy.iterator(), 219 doNotEncodeMap); 220 221 return ac; 222 } 223 224 /** 225 * Creates an alphabet converter, for converting from the original alphabet, 226 * to the encoded alphabet, while leaving the characters in 227 * <em>doNotEncode</em> as they are (if possible). 228 * 229 * <p>Duplicate letters in either original or encoding will be ignored.</p> 230 * 231 * @param original an array of chars representing the original alphabet 232 * @param encoding an array of chars representing the alphabet to be used 233 * for encoding 234 * @param doNotEncode an array of chars to be encoded using the original 235 * alphabet - every char here must appear in 236 * both the previous params 237 * @return The AlphabetConverter 238 * @throws IllegalArgumentException if an AlphabetConverter cannot be 239 * constructed 240 */ 241 public static AlphabetConverter createConverterFromChars( 242 final Character[] original, 243 final Character[] encoding, 244 final Character[] doNotEncode) { 245 return AlphabetConverter.createConverter( 246 convertCharsToIntegers(original), 247 convertCharsToIntegers(encoding), 248 convertCharsToIntegers(doNotEncode)); 249 } 250 251 /** 252 * Creates a new converter from a map. 253 * 254 * @param originalToEncoded a map returned from getOriginalToEncoded() 255 * @return The reconstructed AlphabetConverter 256 * @see AlphabetConverter#getOriginalToEncoded() 257 */ 258 public static AlphabetConverter createConverterFromMap(final Map<Integer, String> originalToEncoded) { 259 final Map<Integer, String> unmodifiableOriginalToEncoded = Collections.unmodifiableMap(originalToEncoded); 260 final Map<String, String> encodedToOriginal = new LinkedHashMap<>(); 261 262 int encodedLetterLength = 1; 263 264 for (final Entry<Integer, String> e : unmodifiableOriginalToEncoded.entrySet()) { 265 encodedToOriginal.put(e.getValue(), codePointToString(e.getKey())); 266 267 if (e.getValue().length() > encodedLetterLength) { 268 encodedLetterLength = e.getValue().length(); 269 } 270 } 271 272 return new AlphabetConverter(unmodifiableOriginalToEncoded, encodedToOriginal, encodedLetterLength); 273 } 274 275 /** 276 * Original string to be encoded. 277 */ 278 private final Map<Integer, String> originalToEncoded; 279 280 /** 281 * Encoding alphabet. 282 */ 283 private final Map<String, String> encodedToOriginal; 284 285 /** 286 * Length of the encoded letter. 287 */ 288 private final int encodedLetterLength; 289 290 /** 291 * Hidden constructor for alphabet converter. Used by static helper methods. 292 * 293 * @param originalToEncoded original string to be encoded 294 * @param encodedToOriginal encoding alphabet 295 * @param encodedLetterLength length of the encoded letter 296 */ 297 private AlphabetConverter(final Map<Integer, String> originalToEncoded, 298 final Map<String, String> encodedToOriginal, 299 final int encodedLetterLength) { 300 301 this.originalToEncoded = originalToEncoded; 302 this.encodedToOriginal = encodedToOriginal; 303 this.encodedLetterLength = encodedLetterLength; 304 } 305 306 /** 307 * Recursive method used when creating encoder/decoder. 308 * 309 * @param level at which point it should add a single encoding 310 * @param currentEncoding current encoding 311 * @param encoding letters encoding 312 * @param originals original values 313 * @param doNotEncodeMap map of values that should not be encoded 314 */ 315 private void addSingleEncoding(final int level, 316 final String currentEncoding, 317 final Collection<Integer> encoding, 318 final Iterator<Integer> originals, 319 final Map<Integer, String> doNotEncodeMap) { 320 321 if (level > 0) { 322 for (final int encodingLetter : encoding) { 323 if (!originals.hasNext()) { 324 return; // done encoding all the original alphabet 325 } 326 // this skips the doNotEncode chars if they are in the 327 // leftmost place 328 if (level != encodedLetterLength 329 || !doNotEncodeMap.containsKey(encodingLetter)) { 330 addSingleEncoding(level - 1, 331 currentEncoding 332 + codePointToString(encodingLetter), 333 encoding, 334 originals, 335 doNotEncodeMap 336 ); 337 } 338 } 339 } else { 340 Integer next = originals.next(); 341 342 while (doNotEncodeMap.containsKey(next)) { 343 final String originalLetterAsString = codePointToString(next); 344 345 originalToEncoded.put(next, originalLetterAsString); 346 encodedToOriginal.put(originalLetterAsString, 347 originalLetterAsString); 348 349 if (!originals.hasNext()) { 350 return; 351 } 352 353 next = originals.next(); 354 } 355 356 final String originalLetterAsString = codePointToString(next); 357 358 originalToEncoded.put(next, currentEncoding); 359 encodedToOriginal.put(currentEncoding, originalLetterAsString); 360 } 361 } 362 363 /** 364 * Decodes a given string. 365 * 366 * @param encoded a string that has been encoded using this 367 * AlphabetConverter 368 * @return The decoded string, {@code null} if the given string is null 369 * @throws UnsupportedEncodingException if unexpected characters that 370 * cannot be handled are encountered 371 */ 372 public String decode(final String encoded) 373 throws UnsupportedEncodingException { 374 if (encoded == null) { 375 return null; 376 } 377 378 final StringBuilder result = new StringBuilder(); 379 380 for (int j = 0; j < encoded.length();) { 381 final int i = encoded.codePointAt(j); 382 final String s = codePointToString(i); 383 384 if (s.equals(originalToEncoded.get(i))) { 385 result.append(s); 386 j++; // because we do not encode in Unicode extended the 387 // length of each encoded char is 1 388 } else { 389 if (j + encodedLetterLength > encoded.length()) { 390 throw new UnsupportedEncodingException("Unexpected end " 391 + "of string while decoding " + encoded); 392 } 393 final String nextGroup = encoded.substring(j, 394 j + encodedLetterLength); 395 final String next = encodedToOriginal.get(nextGroup); 396 if (next == null) { 397 throw new UnsupportedEncodingException( 398 "Unexpected string without decoding (" 399 + nextGroup + ") in " + encoded); 400 } 401 result.append(next); 402 j += encodedLetterLength; 403 } 404 } 405 406 return result.toString(); 407 } 408 409 /** 410 * Encodes a given string. 411 * 412 * @param original the string to be encoded 413 * @return The encoded string, {@code null} if the given string is null 414 * @throws UnsupportedEncodingException if chars that are not supported are 415 * encountered 416 */ 417 public String encode(final String original) 418 throws UnsupportedEncodingException { 419 if (original == null) { 420 return null; 421 } 422 423 final StringBuilder sb = new StringBuilder(); 424 425 for (int i = 0; i < original.length();) { 426 final int codePoint = original.codePointAt(i); 427 428 final String nextLetter = originalToEncoded.get(codePoint); 429 430 if (nextLetter == null) { 431 throw new UnsupportedEncodingException( 432 "Couldn't find encoding for '" 433 + codePointToString(codePoint) 434 + "' in " 435 + original 436 ); 437 } 438 439 sb.append(nextLetter); 440 441 i += Character.charCount(codePoint); 442 } 443 444 return sb.toString(); 445 } 446 447 @Override 448 public boolean equals(final Object obj) { 449 if (obj == null) { 450 return false; 451 } 452 if (obj == this) { 453 return true; 454 } 455 if (!(obj instanceof AlphabetConverter)) { 456 return false; 457 } 458 final AlphabetConverter other = (AlphabetConverter) obj; 459 return originalToEncoded.equals(other.originalToEncoded) 460 && encodedToOriginal.equals(other.encodedToOriginal) 461 && encodedLetterLength == other.encodedLetterLength; 462 } 463 464 /** 465 * Gets the length of characters in the encoded alphabet that are necessary 466 * for each character in the original 467 * alphabet. 468 * 469 * @return The length of the encoded char 470 */ 471 public int getEncodedCharLength() { 472 return encodedLetterLength; 473 } 474 475 /** 476 * Gets the mapping from integer code point of source language to encoded 477 * string. Use to reconstruct converter from 478 * serialized map. 479 * 480 * @return The original map 481 */ 482 public Map<Integer, String> getOriginalToEncoded() { 483 return Collections.unmodifiableMap(originalToEncoded); 484 } 485 486 @Override 487 public int hashCode() { 488 return Objects.hash(originalToEncoded, 489 encodedToOriginal, 490 encodedLetterLength); 491 } 492 493 @Override 494 public String toString() { 495 final StringBuilder sb = new StringBuilder(); 496 // @formatter:off 497 originalToEncoded.forEach((k, v) -> 498 sb.append(codePointToString(k)) 499 .append(ARROW) 500 .append(k) 501 .append(System.lineSeparator())); 502 // @formatter:on 503 return sb.toString(); 504 } 505}