/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.validator.routines;

import java.io.Serializable;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.validator.GenericValidator;

/**
 * <p><b>URL Validation</b> routines.</p>
 * Behavior of validation is modified by passing in options:
 * <ul>
 * <li>ALLOW_2_SLASHES - [FALSE]  Allows double '/' characters in the path
 * component.</li>
 * <li>NO_FRAGMENTS - [FALSE]  By default fragments are allowed, if this option is
 * included then fragments are flagged as illegal.</li>
 * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are
 * considered valid schemes.  Enabling this option will let any scheme pass validation.</li>
 * <li>ALLOW_LOCAL_URLS - [FALSE] Allows local URLs such as https://localhost/.</li>
 * </ul>
 *
 * <p>Originally based on a PHP script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
 * https://javascript.internet.com. However, this validation now bears little resemblance
 * to the PHP original.</p>
 * <pre>
 *   Example of usage:
 *   Construct a UrlValidator with valid schemes of "http", and "https".
 *
 *    String[] schemes = {"http","https"}.
 *    UrlValidator urlValidator = new UrlValidator(schemes);
 *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
 *       System.out.println("URL is valid");
 *    } else {
 *       System.out.println("URL is invalid");
 *    }
 *
 *    prints "URL is invalid"
 *   If instead the default constructor is used.
 *
 *    UrlValidator urlValidator = new UrlValidator();
 *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
 *       System.out.println("URL is valid");
 *    } else {
 *       System.out.println("URL is invalid");
 *    }
 *
 *   prints out "URL is valid"
 *  </pre>
 *
 * @see
 * <a href="http://www.ietf.org/rfc/rfc2396.txt">
 *  Uniform Resource Identifiers (URI): Generic Syntax
 * </a>
 *
 * @since 1.4
 */
public class UrlValidator implements Serializable {

    private static final long serialVersionUID = 7557161713937335013L;

    private static final int MAX_UNSIGNED_16_BIT_INT = 0xFFFF; // port max

    /**
     * Allows all validly formatted schemes to pass validation instead of
     * supplying a set of valid schemes.
     */
    public static final long ALLOW_ALL_SCHEMES = 1 << 0;

    /**
     * Allow two slashes in the path component of the URL.
     */
    public static final long ALLOW_2_SLASHES = 1 << 1;

    /**
     * Enabling this option disallows any URL fragments.
     */
    public static final long NO_FRAGMENTS = 1 << 2;

    /**
     * Allow local URLs, such as https://localhost/ or https://machine/ .
     * This enables a broad-brush check, for complex local machine name
     * validation requirements you should create your validator with
     * a {@link RegexValidator} instead ({@link #UrlValidator(RegexValidator, long)})
     */
    public static final long ALLOW_LOCAL_URLS = 1 << 3; // CHECKSTYLE IGNORE MagicNumber

    /**
     * Protocol scheme (e.g. http, ftp, https).
     */
    private static final String SCHEME_REGEX = "^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*";
    private static final Pattern SCHEME_PATTERN = Pattern.compile(SCHEME_REGEX);

    // Drop numeric, and  "+-." for now
    // TODO does not allow for optional userinfo.
    // Validation of character set is done by isValidAuthority
    private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\."; // allows for IPV4 but not IPV6
    // Allow for IPv4 mapped addresses: ::FFF:123.123.123.123
    // Matched separately from the host because ':' could be confused with the port prefix.
    private static final String IPV6_REGEX = "::FFFF:(?:\\d{1,3}\\.){3}\\d{1,3}|[0-9a-fA-F:]+";

    // userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
    // unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    // sub-delims    = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
    // We assume that password has the same valid chars as user info
    // NOTE: "%-." inside the class is a character RANGE ('%' through '.'), not three literals.
    // The range spans % & ' ( ) * + , - . which are all allowed here, so the accepted set is
    // still exactly the one described above.
    private static final String USERINFO_CHARS_REGEX = "[a-zA-Z0-9%-._~!$&'()*+,;=]";

    // since neither ':' nor '@' are allowed chars, we don't need to use non-greedy matching
    private static final String USERINFO_FIELD_REGEX =
            USERINFO_CHARS_REGEX + "+" + // At least one character for the name
            "(?::" + USERINFO_CHARS_REGEX + "*)?@"; // colon and password may be absent

    // Capturing groups: 1 = IPv6 literal (without brackets), 2 = host or IPv4 (userinfo such as
    // "user:pass@" is matched but not captured), 3 = port digits, 4 = anything left over.
    private static final String AUTHORITY_REGEX =
            "(?:\\[(" + IPV6_REGEX + ")\\]|(?:(?:" + USERINFO_FIELD_REGEX + ")?([" + AUTHORITY_CHARS_REGEX + "]*)))(?::(\\d*))?(.*)?";
    private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX);

    private static final int PARSE_AUTHORITY_IPV6 = 1;

    private static final int PARSE_AUTHORITY_HOST_IP = 2; // excludes userinfo, if present

    private static final int PARSE_AUTHORITY_PORT = 3; // excludes leading colon

    /**
     * Should always be empty. The code currently allows spaces.
     */
    private static final int PARSE_AUTHORITY_EXTRA = 4;

    private static final String PATH_REGEX = "^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$";
    private static final Pattern PATH_PATTERN = Pattern.compile(PATH_REGEX);

    private static final String QUERY_REGEX = "^(\\S*)$";
    private static final Pattern QUERY_PATTERN = Pattern.compile(QUERY_REGEX);

    /**
     * If no schemes are provided, default to this set.
     */
    private static final String[] DEFAULT_SCHEMES = {"http", "https", "ftp"}; // Must be lower-case

    /**
     * Singleton instance of this class with default schemes and options.
     * Must be declared after {@link #DEFAULT_SCHEMES}, which the constructor reads.
     */
    private static final UrlValidator DEFAULT_URL_VALIDATOR = new UrlValidator();

    /**
     * Returns the singleton instance of this class with default schemes and options.
     * @return singleton instance with default schemes and options
     */
    public static UrlValidator getInstance() {
        return DEFAULT_URL_VALIDATOR;
    }

    /**
     * Tests whether the given flag is on. If the flag has more than one bit set
     * (e.g. 3) this returns true when <em>any</em> of those bits is set in the options.
     *
     * @param flag Flag value to check.
     * @param options what to check
     *
     * @return whether at least one bit of the flag is set in the options.
     */
    private static boolean isOn(final long flag, final long options) {
        // Compare against zero rather than "> 0" so a flag in the sign bit is handled too.
        return (options & flag) != 0;
    }

    /**
     * Tests whether a port string captured from the authority is acceptable.
     * A {@code null} or blank port (no port given, or a bare trailing colon) is accepted.
     *
     * @param port the port digits without the leading colon, may be {@code null}
     * @return true if the port is absent or within 0..65535
     */
    private static boolean isValidPort(final String port) {
        if (GenericValidator.isBlankOrNull(port)) {
            return true;
        }
        try {
            final int portNumber = Integer.parseInt(port);
            return portNumber >= 0 && portNumber <= MAX_UNSIGNED_16_BIT_INT;
        } catch (final NumberFormatException nfe) {
            return false; // this can happen for big numbers
        }
    }

    /**
     * Holds the set of current validation options.
     */
    private final long options;

    /**
     * The set of schemes that are allowed to be in a URL.
     */
    private final Set<String> allowedSchemes; // Must be lower-case

    /**
     * Regular expressions used to manually validate authorities if IANA
     * domain name validation isn't desired.
     */
    private final RegexValidator authorityValidator;

    private final DomainValidator domainValidator;

    /**
     * Create a UrlValidator with default properties.
     */
    public UrlValidator() {
        this(null);
    }

    /**
     * Initialize a UrlValidator with the given validation options.
     * @param options The options should be set using the public constants declared in
     * this class.  To set multiple options you simply add them together.  For example,
     * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
     */
    public UrlValidator(final long options) {
        this(null, null, options);
    }

    /**
     * Initialize a UrlValidator with the given validation options.
     * @param authorityValidator Regular expression validator used to validate the authority part
     * This allows the user to override the standard set of domains.
     * @param options Validation options. Set using the public constants of this class.
     * To set multiple options, simply add them together:
     * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p>
     * enables both of those options.
     */
    public UrlValidator(final RegexValidator authorityValidator, final long options) {
        this(null, authorityValidator, options);
    }

    /**
     * Behavior of validation is modified by passing in several strings options:
     * @param schemes Pass in one or more URL schemes to consider valid, passing in
     *        a null will default to "http,https,ftp" being valid.
     *        If a non-null schemes is specified then all valid schemes must
     *        be specified. Setting the ALLOW_ALL_SCHEMES option will
     *        ignore the contents of schemes.
     */
    public UrlValidator(final String[] schemes) {
        this(schemes, 0L);
    }

    /**
     * Behavior of validation is modified by passing in options:
     * @param schemes The set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
     * @param options The options should be set using the public constants declared in
     * this class.  To set multiple options you simply add them together.  For example,
     * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
     */
    public UrlValidator(final String[] schemes, final long options) {
        this(schemes, null, options);
    }

    /**
     * Customizable constructor. Validation behavior is modified by passing in options.
     * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
     * @param authorityValidator Regular expression validator used to validate the authority part
     * @param options Validation options. Set using the public constants of this class.
     * To set multiple options, simply add them together:
     * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p>
     * enables both of those options.
     */
    public UrlValidator(final String[] schemes, final RegexValidator authorityValidator, final long options) {
        this(schemes, authorityValidator, options, DomainValidator.getInstance(isOn(ALLOW_LOCAL_URLS, options)));
    }

    /**
     * Customizable constructor. Validation behavior is modified by passing in options.
     * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
     * A {@code null} array selects the default schemes (http, https, ftp).
     * Schemes are stored lower-cased.
     * @param authorityValidator Regular expression validator used to validate the authority part
     * @param options Validation options. Set using the public constants of this class.
     * To set multiple options, simply add them together:
     * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p>
     * enables both of those options.
     * @param domainValidator the DomainValidator to use; must agree with ALLOW_LOCAL_URLS setting
     * @throws IllegalArgumentException if {@code domainValidator} is {@code null} or its
     * allow-local setting differs from the ALLOW_LOCAL_URLS option
     * @since 1.7
     */
    public UrlValidator(final String[] schemes, final RegexValidator authorityValidator, final long options,
            final DomainValidator domainValidator) {
        this.options = options;
        if (domainValidator == null) {
            throw new IllegalArgumentException("DomainValidator must not be null");
        }
        if (domainValidator.isAllowLocal() != isOn(ALLOW_LOCAL_URLS, options)) {
            throw new IllegalArgumentException("DomainValidator disagrees with ALLOW_LOCAL_URLS setting");
        }
        this.domainValidator = domainValidator;

        if (isOn(ALLOW_ALL_SCHEMES)) {
            // The scheme set is never consulted in this mode.
            allowedSchemes = Collections.emptySet();
        } else {
            final String[] effectiveSchemes = schemes == null ? DEFAULT_SCHEMES : schemes;
            allowedSchemes = new HashSet<>(effectiveSchemes.length);
            for (final String scheme : effectiveSchemes) {
                allowedSchemes.add(scheme.toLowerCase(Locale.ENGLISH));
            }
        }

        this.authorityValidator = authorityValidator;
    }

    /**
     * Returns the number of times the token appears in the target.
     * Overlapping occurrences are counted, e.g. "//" appears twice in "///".
     * An empty token is reported as occurring zero times.
     *
     * @param token Token value to be counted.
     * @param target Target value to count tokens in.
     * @return the number of tokens.
     */
    protected int countToken(final String token, final String target) {
        // indexOf("", n) never returns -1, so an empty token would otherwise loop forever.
        if (token.isEmpty()) {
            return 0;
        }
        int count = 0;
        // Advance by one (not by token length) so overlapping matches are counted.
        for (int index = target.indexOf(token); index != -1; index = target.indexOf(token, index + 1)) {
            count++;
        }
        return count;
    }

    /**
     * Tests whether the given flag is off. If the flag has more than one bit set
     * (e.g. 3) this returns true only when <em>all</em> of those bits are clear in the options.
     *
     * @param flag Flag value to check.
     *
     * @return whether every bit of the flag is clear in the options.
     */
    private boolean isOff(final long flag) {
        return (options & flag) == 0;
    }

    /**
     * Tests whether the given flag is on. If the flag has more than one bit set
     * (e.g. 3) this returns true when <em>any</em> of those bits is set in the options.
     *
     * @param flag Flag value to check.
     *
     * @return whether at least one bit of the flag is set in the options.
     */
    private boolean isOn(final long flag) {
        return isOn(flag, options);
    }

    /**
     * <p>Checks if a field has a valid URL address.</p>
     *
     * <p>The value must first parse as a {@link URI}; its scheme, authority, path,
     * query and fragment are then checked in that order.</p>
     *
     * Note that the method calls #isValidAuthority()
     * which checks that the domain is valid.
     *
     * <p>The {@code file} scheme (compared case-insensitively, like all scheme matching
     * in this class) is special-cased: a blank or missing authority is accepted without
     * further checks, and an authority containing ':' is rejected.</p>
     *
     * @param value The value validation is being performed on.  A {@code null}
     * value is considered invalid.
     * @return true if the URL is valid.
     */
    public boolean isValid(final String value) {
        if (value == null) {
            return false;
        }

        final URI uri; // ensure value is a valid URI
        try {
            uri = new URI(value);
        } catch (final URISyntaxException e) {
            return false;
        }
        // OK, perform additional validation

        final String scheme = uri.getScheme();
        if (!isValidScheme(scheme)) {
            return false;
        }

        final String authority = uri.getRawAuthority();
        if ("file".equalsIgnoreCase(scheme)) {
            if (GenericValidator.isBlankOrNull(authority)) { // Special case - file: allows an empty authority
                return true; // this is a local file - nothing more to do here
            }
            if (authority != null && authority.contains(":")) {
                return false;
            }
        }
        // Validate the authority
        if (!isValidAuthority(authority)) {
            return false;
        }

        if (!isValidPath(uri.getRawPath())) {
            return false;
        }

        if (!isValidQuery(uri.getRawQuery())) {
            return false;
        }

        return isValidFragment(uri.getRawFragment());
    }

    /**
     * Returns true if the authority is properly formatted.  An authority is the combination
     * of hostname and port.  A {@code null} authority value is considered invalid.
     * Note: this implementation validates the domain unless a RegexValidator was provided.
     * If a RegexValidator was supplied and it matches, then the authority is regarded
     * as valid with no further checks, otherwise the method checks against the
     * AUTHORITY_PATTERN and the DomainValidator (ALLOW_LOCAL_URLS).
     * A port, when present, must be in the range 0..65535 for both bracketed IPv6
     * literals and host names / IPv4 addresses.
     * @param authority Authority value to validate, allows IDN
     * @return true if authority (hostname and port) is valid.
     */
    protected boolean isValidAuthority(final String authority) {
        if (authority == null) {
            return false;
        }

        // check manual authority validation if specified
        if (authorityValidator != null && authorityValidator.isValid(authority)) {
            return true;
        }
        // convert to ASCII if possible
        final String authorityASCII = DomainValidator.unicodeToASCII(authority);

        final Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authorityASCII);
        if (!authorityMatcher.matches()) {
            return false;
        }

        // We have to process IPV6 separately because that is parsed in a different group
        final String ipv6 = authorityMatcher.group(PARSE_AUTHORITY_IPV6);
        if (ipv6 != null) {
            if (!InetAddressValidator.getInstance().isValidInet6Address(ipv6)) {
                return false;
            }
        } else {
            final String hostLocation = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
            // check if authority is hostname or IP address:
            // try a hostname first since that's much more likely, then fall back to IPv4
            if (!this.domainValidator.isValid(hostLocation)
                    && !InetAddressValidator.getInstance().isValidInet4Address(hostLocation)) {
                return false;
            }
        }

        // The port group is shared by both alternatives, so check it for IPv6 hosts as well.
        if (!isValidPort(authorityMatcher.group(PARSE_AUTHORITY_PORT))) {
            return false;
        }

        // Anything after the port other than whitespace makes the authority invalid.
        final String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
        return extra == null || extra.trim().isEmpty();
    }

    /**
     * Returns true if the given fragment is null or fragments are allowed.
     * @param fragment Fragment value to validate.
     * @return true if fragment is valid.
     */
    protected boolean isValidFragment(final String fragment) {
        return fragment == null || isOff(NO_FRAGMENTS);
    }

    /**
     * Returns true if the path is valid.  A {@code null} value is considered invalid.
     * A path is rejected when it contains characters outside the allowed set, when its
     * normalized form still starts at or passes through the parent directory ("/.."),
     * or when it contains "//" and ALLOW_2_SLASHES is not set.
     * @param path Path value to validate.
     * @return true if path is valid.
     */
    protected boolean isValidPath(final String path) {
        if (path == null) {
            return false;
        }

        if (!PATH_PATTERN.matcher(path).matches()) {
            return false;
        }

        try {
            // Don't omit host otherwise leading path may be taken as host if it starts with //
            final URI uri = new URI(null, "localhost", path, null);
            final String norm = uri.normalize().getPath();
            if (norm.startsWith("/../") // Trying to go via the parent dir
             || norm.equals("/..")) {   // Trying to go to the parent dir
                return false;
            }
        } catch (final URISyntaxException e) {
            return false;
        }

        final int slash2Count = countToken("//", path);
        return !(isOff(ALLOW_2_SLASHES) && slash2Count > 0);
    }

    /**
     * Returns true if the query is null or it's a properly formatted query string.
     * A non-null query is accepted when it contains no whitespace.
     * @param query Query value to validate.
     * @return true if query is valid.
     */
    protected boolean isValidQuery(final String query) {
        return query == null || QUERY_PATTERN.matcher(query).matches();
    }

    /**
     * Validate scheme. If schemes[] was initialized to a non null,
     * then only those schemes are allowed.
     * Otherwise the default schemes are "http", "https", "ftp".
     * Matching is case-blind.
     * @param scheme The scheme to validate.  A {@code null} value is considered
     * invalid.
     * @return true if valid.
     */
    protected boolean isValidScheme(final String scheme) {
        if (scheme == null) {
            return false;
        }

        if (!SCHEME_PATTERN.matcher(scheme).matches()) {
            return false;
        }

        // With ALLOW_ALL_SCHEMES any well-formed scheme passes; otherwise it must be in the allowed set.
        return isOn(ALLOW_ALL_SCHEMES) || allowedSchemes.contains(scheme.toLowerCase(Locale.ENGLISH));
    }

}