001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.validator; 018 019import java.io.Serializable; 020import java.util.Arrays; 021import java.util.HashSet; 022import java.util.Set; 023import java.util.regex.Matcher; 024import java.util.regex.Pattern; 025 026import org.apache.commons.validator.routines.InetAddressValidator; 027import org.apache.commons.validator.util.Flags; 028 029/** 030 * <p>Validates URLs.</p> 031 * Behavour of validation is modified by passing in options: 032 * <ul> 033 * <li>ALLOW_2_SLASHES - [FALSE] Allows double '/' characters in the path 034 * component.</li> 035 * <li>NO_FRAGMENT- [FALSE] By default fragments are allowed, if this option is 036 * included then fragments are flagged as illegal.</li> 037 * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are 038 * considered valid schemes. Enabling this option will let any scheme pass validation.</li> 039 * </ul> 040 * 041 * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02, 042 * https://javascript.internet.com. However, this validation now bears little resemblance 043 * to the php original.</p> 044 * <pre> 045 * Example of usage: 046 * Construct a UrlValidator with valid schemes of "http", and "https". 047 * 048 * String[] schemes = {"http","https"}. 049 * UrlValidator urlValidator = new UrlValidator(schemes); 050 * if (urlValidator.isValid("ftp://foo.bar.com/")) { 051 * System.out.println("URL is valid"); 052 * } else { 053 * System.out.println("URL is invalid"); 054 * } 055 * 056 * prints "URL is invalid" 057 * If instead the default constructor is used. 058 * 059 * UrlValidator urlValidator = new UrlValidator(); 060 * if (urlValidator.isValid("ftp://foo.bar.com/")) { 061 * System.out.println("URL is valid"); 062 * } else { 063 * System.out.println("URL is invalid"); 064 * } 065 * 066 * prints out "URL is valid" 067 * </pre> 068 * 069 * @see 070 * <a href="http://www.ietf.org/rfc/rfc2396.txt"> 071 * Uniform Resource Identifiers (URI): Generic Syntax 072 * </a> 073 * 074 * @since 1.1 075 * @deprecated Use the new UrlValidator in the routines package. This class 076 * will be removed in a future release. 077 */ 078@Deprecated 079public class UrlValidator implements Serializable { 080 081 private static final long serialVersionUID = 24137157400029593L; 082 083 /** 084 * Allows all validly formatted schemes to pass validation instead of 085 * supplying a set of valid schemes. 086 */ 087 public static final int ALLOW_ALL_SCHEMES = 1 << 0; 088 089 /** 090 * Allow two slashes in the path component of the URL. 091 */ 092 public static final int ALLOW_2_SLASHES = 1 << 1; 093 094 /** 095 * Enabling this options disallows any URL fragments. 096 */ 097 public static final int NO_FRAGMENTS = 1 << 2; 098 099 private static final String ALPHA_CHARS = "a-zA-Z"; 100 101// NOT USED private static final String ALPHA_NUMERIC_CHARS = ALPHA_CHARS + "\\d"; 102 103 private static final String SPECIAL_CHARS = ";/@&=,.?:+$"; 104 105 private static final String VALID_CHARS = "[^\\s" + SPECIAL_CHARS + "]"; 106 107 // Drop numeric, and "+-." for now 108 private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\."; 109 110 private static final String ATOM = VALID_CHARS + '+'; 111 112 /** 113 * This expression derived/taken from the BNF for URI (RFC2396). 114 */ 115 private static final String URL_REGEX = 116 "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"; 117 // 12 3 4 5 6 7 8 9 118 private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEX); 119 120 /** 121 * Schema/Protocol (ie. http:, ftp:, file:, etc). 122 */ 123 private static final int PARSE_URL_SCHEME = 2; 124 125 /** 126 * Includes hostname/ip and port number. 127 */ 128 private static final int PARSE_URL_AUTHORITY = 4; 129 130 private static final int PARSE_URL_PATH = 5; 131 132 private static final int PARSE_URL_QUERY = 7; 133 134 private static final int PARSE_URL_FRAGMENT = 9; 135 136 /** 137 * Protocol (ie. http:, ftp:,https:). 138 */ 139 private static final Pattern SCHEME_PATTERN = Pattern.compile("^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*"); 140 141 private static final String AUTHORITY_REGEX = 142 "^([" + AUTHORITY_CHARS_REGEX + "]*)(:\\d*)?(.*)?"; 143 // 1 2 3 4 144 private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX); 145 146 private static final int PARSE_AUTHORITY_HOST_IP = 1; 147 148 private static final int PARSE_AUTHORITY_PORT = 2; 149 150 /** 151 * Should always be empty. 152 */ 153 private static final int PARSE_AUTHORITY_EXTRA = 3; 154 155 private static final Pattern PATH_PATTERN = Pattern.compile("^(/[-\\w:@&?=+,.!/~*'%$_;]*)?$"); 156 157 private static final Pattern QUERY_PATTERN = Pattern.compile("^(.*)$"); 158 159 private static final Pattern LEGAL_ASCII_PATTERN = Pattern.compile("^\\p{ASCII}+$"); 160 161 private static final Pattern DOMAIN_PATTERN = 162 Pattern.compile("^" + ATOM + "(\\." + ATOM + ")*$"); 163 164 private static final Pattern PORT_PATTERN = Pattern.compile("^:(\\d{1,5})$"); 165 166 private static final Pattern ATOM_PATTERN = Pattern.compile("^(" + ATOM + ").*?$"); 167 168 private static final Pattern ALPHA_PATTERN = Pattern.compile("^[" + ALPHA_CHARS + "]"); 169 170 /** 171 * Holds the set of current validation options. 172 */ 173 private final Flags options; 174 175 /** 176 * The set of schemes that are allowed to be in a URL. 177 */ 178 private final Set<String> allowedSchemes = new HashSet<>(); 179 180 /** 181 * If no schemes are provided, default to this set. 182 */ 183 protected String[] defaultSchemes = {"http", "https", "ftp"}; 184 185 /** 186 * Create a UrlValidator with default properties. 187 */ 188 public UrlValidator() { 189 this(null); 190 } 191 192 /** 193 * Initialize a UrlValidator with the given validation options. 194 * @param options The options should be set using the public constants declared in 195 * this class. To set multiple options you simply add them together. For example, 196 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options. 197 */ 198 public UrlValidator(final int options) { 199 this(null, options); 200 } 201 202 /** 203 * Behavior of validation is modified by passing in several strings options: 204 * @param schemes Pass in one or more URL schemes to consider valid, passing in 205 * a null will default to "http,https,ftp" being valid. 206 * If a non-null schemes is specified then all valid schemes must 207 * be specified. Setting the ALLOW_ALL_SCHEMES option will 208 * ignore the contents of schemes. 209 */ 210 public UrlValidator(final String[] schemes) { 211 this(schemes, 0); 212 } 213 214 /** 215 * Behavour of validation is modified by passing in options: 216 * @param schemes The set of valid schemes. 217 * @param options The options should be set using the public constants declared in 218 * this class. To set multiple options you simply add them together. For example, 219 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options. 220 */ 221 public UrlValidator(String[] schemes, final int options) { 222 this.options = new Flags(options); 223 224 if (this.options.isOn(ALLOW_ALL_SCHEMES)) { 225 return; 226 } 227 228 if (schemes == null) { 229 schemes = this.defaultSchemes; 230 } 231 232 this.allowedSchemes.addAll(Arrays.asList(schemes)); 233 } 234 235 /** 236 * Returns the number of times the token appears in the target. 237 * @param token Token value to be counted. 238 * @param target Target value to count tokens in. 239 * @return the number of tokens. 240 */ 241 protected int countToken(final String token, final String target) { 242 int tokenIndex = 0; 243 int count = 0; 244 while (tokenIndex != -1) { 245 tokenIndex = target.indexOf(token, tokenIndex); 246 if (tokenIndex > -1) { 247 tokenIndex++; 248 count++; 249 } 250 } 251 return count; 252 } 253 254 /** 255 * <p>Checks if a field has a valid URL address.</p> 256 * 257 * @param value The value validation is being performed on. A {@code null} 258 * value is considered invalid. 259 * @return true if the URL is valid. 260 */ 261 public boolean isValid(final String value) { 262 if (value == null) { 263 return false; 264 } 265 if (!LEGAL_ASCII_PATTERN.matcher(value).matches()) { 266 return false; 267 } 268 269 // Check the whole url address structure 270 final Matcher urlMatcher = URL_PATTERN.matcher(value); 271 if (!urlMatcher.matches()) { 272 return false; 273 } 274 275 if (!isValidScheme(urlMatcher.group(PARSE_URL_SCHEME))) { 276 return false; 277 } 278 279 if (!isValidAuthority(urlMatcher.group(PARSE_URL_AUTHORITY))) { 280 return false; 281 } 282 283 if (!isValidPath(urlMatcher.group(PARSE_URL_PATH))) { 284 return false; 285 } 286 287 if (!isValidQuery(urlMatcher.group(PARSE_URL_QUERY))) { 288 return false; 289 } 290 291 if (!isValidFragment(urlMatcher.group(PARSE_URL_FRAGMENT))) { 292 return false; 293 } 294 295 return true; 296 } 297 298 /** 299 * Returns true if the authority is properly formatted. An authority is the combination 300 * of hostname and port. A {@code null} authority value is considered invalid. 301 * @param authority Authority value to validate. 302 * @return true if authority (hostname and port) is valid. 303 */ 304 protected boolean isValidAuthority(final String authority) { 305 if (authority == null) { 306 return false; 307 } 308 309 final InetAddressValidator inetAddressValidator = 310 InetAddressValidator.getInstance(); 311 312 final Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authority); 313 if (!authorityMatcher.matches()) { 314 return false; 315 } 316 317 boolean hostname = false; 318 // check if authority is IP address or hostname 319 String hostIP = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP); 320 final boolean ipV4Address = inetAddressValidator.isValid(hostIP); 321 322 if (!ipV4Address) { 323 // Domain is hostname name 324 hostname = DOMAIN_PATTERN.matcher(hostIP).matches(); 325 } 326 327 //rightmost hostname will never start with a digit. 328 if (hostname) { 329 // LOW-TECH FIX FOR VALIDATOR-202 330 // TODO: Rewrite to use ArrayList and .add semantics: see VALIDATOR-203 331 final char[] chars = hostIP.toCharArray(); 332 int size = 1; 333 for (final char element : chars) { 334 if (element == '.') { 335 size++; 336 } 337 } 338 final String[] domainSegment = new String[size]; 339 boolean match = true; 340 int segmentCount = 0; 341 int segmentLength = 0; 342 343 while (match) { 344 final Matcher atomMatcher = ATOM_PATTERN.matcher(hostIP); 345 match = atomMatcher.matches(); 346 if (match) { 347 domainSegment[segmentCount] = atomMatcher.group(1); 348 segmentLength = domainSegment[segmentCount].length() + 1; 349 hostIP = 350 segmentLength >= hostIP.length() 351 ? "" 352 : hostIP.substring(segmentLength); 353 354 segmentCount++; 355 } 356 } 357 final String topLevel = domainSegment[segmentCount - 1]; 358 if (topLevel.length() < 2 || topLevel.length() > 4) { // CHECKSTYLE IGNORE MagicNumber (deprecated code) 359 return false; 360 } 361 362 // First letter of top level must be a alpha 363 if (!ALPHA_PATTERN.matcher(topLevel.substring(0, 1)).matches()) { 364 return false; 365 } 366 367 // Make sure there's a host name preceding the authority. 368 if (segmentCount < 2) { 369 return false; 370 } 371 } 372 373 if (!hostname && !ipV4Address) { 374 return false; 375 } 376 377 final String port = authorityMatcher.group(PARSE_AUTHORITY_PORT); 378 if (port != null && !PORT_PATTERN.matcher(port).matches()) { 379 return false; 380 } 381 382 final String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA); 383 if (!GenericValidator.isBlankOrNull(extra)) { 384 return false; 385 } 386 387 return true; 388 } 389 390 /** 391 * Returns true if the given fragment is null or fragments are allowed. 392 * @param fragment Fragment value to validate. 393 * @return true if fragment is valid. 394 */ 395 protected boolean isValidFragment(final String fragment) { 396 if (fragment == null) { 397 return true; 398 } 399 400 return options.isOff(NO_FRAGMENTS); 401 } 402 403 /** 404 * Returns true if the path is valid. A {@code null} value is considered invalid. 405 * @param path Path value to validate. 406 * @return true if path is valid. 407 */ 408 protected boolean isValidPath(final String path) { 409 if (path == null) { 410 return false; 411 } 412 413 if (!PATH_PATTERN.matcher(path).matches()) { 414 return false; 415 } 416 417 final int slash2Count = countToken("//", path); 418 if (options.isOff(ALLOW_2_SLASHES) && slash2Count > 0) { 419 return false; 420 } 421 422 final int slashCount = countToken("/", path); 423 final int dot2Count = countToken("..", path); 424 if (dot2Count > 0 && slashCount - slash2Count - 1 <= dot2Count) { 425 return false; 426 } 427 428 return true; 429 } 430 431 /** 432 * Returns true if the query is null or it's a properly formatted query string. 433 * @param query Query value to validate. 434 * @return true if query is valid. 435 */ 436 protected boolean isValidQuery(final String query) { 437 if (query == null) { 438 return true; 439 } 440 441 return QUERY_PATTERN.matcher(query).matches(); 442 } 443 444 /** 445 * Validate scheme. If schemes[] was initialized to a non null, 446 * then only those scheme's are allowed. Note this is slightly different 447 * than for the constructor. 448 * @param scheme The scheme to validate. A {@code null} value is considered 449 * invalid. 450 * @return true if valid. 451 */ 452 protected boolean isValidScheme(final String scheme) { 453 if (scheme == null) { 454 return false; 455 } 456 457 if (!SCHEME_PATTERN.matcher(scheme).matches()) { 458 return false; 459 } 460 461 if (options.isOff(ALLOW_ALL_SCHEMES) && !allowedSchemes.contains(scheme)) { 462 return false; 463 } 464 465 return true; 466 } 467}