/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.archivers.dump;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Stack;

import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipEncoding;
import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import org.apache.commons.compress.utils.IOUtils;

/**
 * The DumpArchiveInputStream reads a UNIX dump archive as an InputStream. Methods are provided to position at each successive entry in the archive, and then
 * read each entry as a normal input stream using read().
 * <p>
 * There doesn't seem to be any hint on the encoding of string values in any piece of documentation. Given that the main purpose of dump/restore is backing up
 * a system, it seems very likely the format uses the current default encoding of the system.
 * </p>
 *
 * @NotThreadSafe
 * @since 1.3
 */
public class DumpArchiveInputStream extends ArchiveInputStream<DumpArchiveEntry> {

    private static final String CURRENT_PATH_SEGMENT = ".";
    private static final String PARENT_PATH_SEGMENT = "..";

    /**
     * Looks at the first few bytes of the file to decide if it's a dump archive. With 32 bytes we can look at the magic value, with a full 1k we can verify
     * the checksum.
     *
     * @param buffer data to match
     * @param length length of data
     * @return whether the buffer seems to contain dump data
     */
    public static boolean matches(final byte[] buffer, final int length) {
        // do we have enough of the header?
        if (length < 32) {
            return false;
        }

        // this is the best test
        if (length >= DumpArchiveConstants.TP_SIZE) {
            return DumpArchiveUtil.verify(buffer);
        }

        // this will work in a pinch.
        return DumpArchiveConstants.NFS_MAGIC == DumpArchiveUtil.convert32(buffer, 24);
    }

    private final DumpArchiveSummary summary;
    private DumpArchiveEntry active;
    private boolean isClosed;
    private boolean hasHitEOF;
    private long entrySize;
    private long entryOffset;
    private int readIdx;
    private final byte[] readBuf = new byte[DumpArchiveConstants.TP_SIZE];
    private byte[] blockBuffer;
    private int recordOffset;
    private long filepos;

    protected TapeInputStream raw;
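    /*
     * A minimal usage sketch (the file name is illustrative; Files and Paths are java.nio.file classes the
     * caller would need to import). Entries are visited with getNextEntry() and each non-directory entry's
     * data is then available from this stream's read() methods:
     *
     * try (DumpArchiveInputStream dump = new DumpArchiveInputStream(Files.newInputStream(Paths.get("backup.dump")))) {
     *     DumpArchiveEntry entry;
     *     while ((entry = dump.getNextEntry()) != null) {
     *         if (!entry.isDirectory()) {
     *             final byte[] content = IOUtils.toByteArray(dump); // reads the current entry's data
     *         }
     *     }
     * }
     */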
    /** Map of ino -> dirent entry. We can use this to reconstruct full paths. */
    private final Map<Integer, Dirent> names = new HashMap<>();

    /** Map of ino -> (directory) entry when we're missing one or more elements in the path. */
    private final Map<Integer, DumpArchiveEntry> pending = new HashMap<>();

    /** Queue of (directory) entries where we now have the full path. */
    private final Queue<DumpArchiveEntry> queue;

    /**
     * The encoding to use for file names and labels.
     */
    private final ZipEncoding zipEncoding;

    /**
     * Constructor using the platform's default encoding for file names.
     *
     * @param is stream to read from
     * @throws ArchiveException on error
     */
    public DumpArchiveInputStream(final InputStream is) throws ArchiveException {
        this(is, null);
    }

    /**
     * Constructs a new instance.
     *
     * @param is       stream to read from
     * @param encoding the encoding to use for file names, use null for the platform's default encoding
     * @since 1.6
     * @throws ArchiveException on error
     */
    public DumpArchiveInputStream(final InputStream is, final String encoding) throws ArchiveException {
        super(is, encoding);
        this.raw = new TapeInputStream(is);
        this.hasHitEOF = false;
        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);

        try {
            // read header, verify it's a dump archive.
            final byte[] headerBytes = raw.readRecord();

            if (!DumpArchiveUtil.verify(headerBytes)) {
                throw new UnrecognizedFormatException();
            }

            // get summary information
            summary = new DumpArchiveSummary(headerBytes, this.zipEncoding);

            // reset buffer with actual block size.
            raw.resetBlockSize(summary.getNTRec(), summary.isCompressed());

            // allocate our read buffer.
            blockBuffer = new byte[4 * DumpArchiveConstants.TP_SIZE];

            // skip past CLRI and BITS segments since we don't handle them yet.
            readCLRI();
            readBITS();
        } catch (final IOException ex) {
            throw new ArchiveException(ex.getMessage(), ex);
        }

        // put in a dummy record for the root node.
        final Dirent root = new Dirent(2, 2, 4, CURRENT_PATH_SEGMENT);
        names.put(2, root);

        // use a priority queue to ensure parent directories are released first.
        queue = new PriorityQueue<>(10, (p, q) -> {
            if (p.getOriginalName() == null || q.getOriginalName() == null) {
                return Integer.MAX_VALUE;
            }

            return p.getOriginalName().compareTo(q.getOriginalName());
        });
    }

    /**
     * Closes the stream for this entry.
     */
    @Override
    public void close() throws IOException {
        if (!isClosed) {
            isClosed = true;
            raw.close();
        }
    }

    @Override
    public long getBytesRead() {
        return raw.getBytesRead();
    }

    @Deprecated
    @Override
    public int getCount() {
        return (int) getBytesRead();
    }

    /**
     * Reads the next entry.
     *
     * @return the next entry
     * @throws IOException on error
     * @deprecated Use {@link #getNextEntry()}.
     */
    @Deprecated
    public DumpArchiveEntry getNextDumpEntry() throws IOException {
        return getNextEntry();
    }

    @Override
    public DumpArchiveEntry getNextEntry() throws IOException {
        DumpArchiveEntry entry = null;
        String path = null;
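        // Rough flow of this method: directories whose full path is already known are handed out from the
        // queue first. Otherwise we skip whatever remains of the previous entry, read and verify the next
        // record header, skip ADDR continuation segments, and stop at the END marker. Directory entries are
        // consumed immediately via readDirectoryEntry(); regular files are returned so their data can be
        // read from this stream. Entries whose path cannot be resolved yet are deferred to 'pending'.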
        // is there anything in the queue?
        if (!queue.isEmpty()) {
            return queue.remove();
        }

        while (entry == null) {
            if (hasHitEOF) {
                return null;
            }

            // skip any remaining records in this segment for prior file.
            // we might still have holes... easiest to do it
            // block by block. We may want to revisit this if
            // the unnecessary decompression time adds up.
            while (readIdx < active.getHeaderCount()) {
                if (!active.isSparseRecord(readIdx++) && raw.skip(DumpArchiveConstants.TP_SIZE) == -1) {
                    throw new EOFException();
                }
            }

            readIdx = 0;
            filepos = raw.getBytesRead();

            byte[] headerBytes = raw.readRecord();

            if (!DumpArchiveUtil.verify(headerBytes)) {
                throw new InvalidFormatException();
            }

            active = DumpArchiveEntry.parse(headerBytes);

            // skip any remaining segments for prior file.
            while (DumpArchiveConstants.SEGMENT_TYPE.ADDR == active.getHeaderType()) {
                if (raw.skip((long) DumpArchiveConstants.TP_SIZE * (active.getHeaderCount() - active.getHeaderHoles())) == -1) {
                    throw new EOFException();
                }

                filepos = raw.getBytesRead();
                headerBytes = raw.readRecord();

                if (!DumpArchiveUtil.verify(headerBytes)) {
                    throw new InvalidFormatException();
                }

                active = DumpArchiveEntry.parse(headerBytes);
            }

            // check if this is an end-of-volume marker.
            if (DumpArchiveConstants.SEGMENT_TYPE.END == active.getHeaderType()) {
                hasHitEOF = true;

                return null;
            }

            entry = active;

            if (entry.isDirectory()) {
                readDirectoryEntry(active);

                // now we create an empty InputStream.
                entryOffset = 0;
                entrySize = 0;
                readIdx = active.getHeaderCount();
            } else {
                entryOffset = 0;
                entrySize = active.getEntrySize();
                readIdx = 0;
            }

            recordOffset = readBuf.length;

            path = getPath(entry);

            if (path == null) {
                entry = null;
            }
        }

        entry.setName(path);
        entry.setSimpleName(names.get(entry.getIno()).getName());
        entry.setOffset(filepos);

        return entry;
    }

    /**
     * Gets the full path for the specified archive entry, or null if there's a gap.
     *
     * @param entry the entry to resolve the path for
     * @return full path for the specified archive entry, or null if there's a gap.
     * @throws DumpArchiveException if an infinite loop is detected in the directory entries.
     */
    private String getPath(final DumpArchiveEntry entry) throws DumpArchiveException {
        // build the stack of elements. It's possible that we're
        // still missing an intermediate value and if so we defer
        // this entry until the missing element shows up.
        final Stack<String> elements = new Stack<>();
        final BitSet visited = new BitSet();
        Dirent dirent = null;
        for (int i = entry.getIno();; i = dirent.getParentIno()) {
            if (!names.containsKey(i)) {
                elements.clear();
                break;
            }
            if (visited.get(i)) {
                throw new DumpArchiveException("Duplicate node " + i);
            }
            dirent = names.get(i);
            visited.set(i);
            elements.push(dirent.getName());
            if (dirent.getIno() == dirent.getParentIno()) {
                break;
            }
        }
        // if an element is missing defer the work and read next entry.
        if (elements.isEmpty()) {
            pending.put(entry.getIno(), entry);
            return null;
        }
        // generate full path from stack of elements.
        final StringBuilder sb = new StringBuilder(elements.pop());
        while (!elements.isEmpty()) {
            sb.append('/');
            sb.append(elements.pop());
        }
        return sb.toString();
    }
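    /*
     * Illustration of the path reconstruction above (the inode numbers are made up): the root dirent is ino 2,
     * its own parent, named ".". If names contains (ino 5, parent 2, "etc") and (ino 27, parent 5, "passwd"),
     * getPath() for ino 27 walks 27 -> 5 -> 2 and yields "./etc/passwd". If the dirent for ino 5 has not been
     * read yet, the entry is parked in 'pending' and released through the queue once readDirectoryEntry() sees it.
     */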
    /**
     * Gets the archive summary information.
     *
     * @return the summary
     */
    public DumpArchiveSummary getSummary() {
        return summary;
    }

    /**
     * Reads bytes from the current dump archive entry.
     *
     * This method is aware of the boundaries of the current entry in the archive and will deal with them as if they were this stream's start and EOF.
     *
     * @param buf The buffer into which to place bytes read.
     * @param off The offset at which to place bytes read.
     * @param len The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error
     */
    @Override
    public int read(final byte[] buf, int off, int len) throws IOException {
        if (len == 0) {
            return 0;
        }
        int totalRead = 0;

        if (hasHitEOF || isClosed || entryOffset >= entrySize) {
            return -1;
        }

        if (active == null) {
            throw new IllegalStateException("No current dump entry");
        }

        if (len + entryOffset > entrySize) {
            len = (int) (entrySize - entryOffset);
        }

        while (len > 0) {
            final int sz = Math.min(len, readBuf.length - recordOffset);

            // copy any data we have
            if (recordOffset + sz <= readBuf.length) {
                System.arraycopy(readBuf, recordOffset, buf, off, sz);
                totalRead += sz;
                recordOffset += sz;
                len -= sz;
                off += sz;
            }

            // load next block if necessary.
            if (len > 0) {
                if (readIdx >= 512) {
                    final byte[] headerBytes = raw.readRecord();

                    if (!DumpArchiveUtil.verify(headerBytes)) {
                        throw new InvalidFormatException();
                    }

                    active = DumpArchiveEntry.parse(headerBytes);
                    readIdx = 0;
                }

                if (!active.isSparseRecord(readIdx++)) {
                    final int r = raw.read(readBuf, 0, readBuf.length);
                    if (r != readBuf.length) {
                        throw new EOFException();
                    }
                } else {
                    Arrays.fill(readBuf, (byte) 0);
                }

                recordOffset = 0;
            }
        }

        entryOffset += totalRead;

        return totalRead;
    }

    /**
     * Read BITS segment.
     */
    private void readBITS() throws IOException {
        final byte[] buffer = raw.readRecord();

        if (!DumpArchiveUtil.verify(buffer)) {
            throw new InvalidFormatException();
        }

        active = DumpArchiveEntry.parse(buffer);

        if (DumpArchiveConstants.SEGMENT_TYPE.BITS != active.getHeaderType()) {
            throw new InvalidFormatException();
        }

        // we don't do anything with this yet.
        if (raw.skip((long) DumpArchiveConstants.TP_SIZE * active.getHeaderCount()) == -1) {
            throw new EOFException();
        }
        readIdx = active.getHeaderCount();
    }

    /**
     * Read CLRI (deleted inode) segment.
     */
    private void readCLRI() throws IOException {
        final byte[] buffer = raw.readRecord();

        if (!DumpArchiveUtil.verify(buffer)) {
            throw new InvalidFormatException();
        }

        active = DumpArchiveEntry.parse(buffer);

        if (DumpArchiveConstants.SEGMENT_TYPE.CLRI != active.getHeaderType()) {
            throw new InvalidFormatException();
        }

        // we don't do anything with this yet.
        if (raw.skip((long) DumpArchiveConstants.TP_SIZE * active.getHeaderCount()) == -1) {
            throw new EOFException();
        }
        readIdx = active.getHeaderCount();
    }
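    /*
     * Each directory record parsed by readDirectoryEntry() below is laid out as follows (byte offsets are
     * relative to the start of the record, matching the reads in the parsing loop):
     *
     *   bytes 0-3  inode number (32-bit)
     *   bytes 4-5  record length, reclen (16-bit)
     *   byte  6    entry type
     *   byte  7    length of the name
     *   bytes 8+   name, decoded with the configured ZipEncoding
     */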
    /**
     * Read directory entry.
     */
    private void readDirectoryEntry(DumpArchiveEntry entry) throws IOException {
        long size = entry.getEntrySize();
        boolean first = true;

        while (first || DumpArchiveConstants.SEGMENT_TYPE.ADDR == entry.getHeaderType()) {
            // read the header that we just peeked at.
            if (!first) {
                raw.readRecord();
            }

            if (!names.containsKey(entry.getIno()) && DumpArchiveConstants.SEGMENT_TYPE.INODE == entry.getHeaderType()) {
                pending.put(entry.getIno(), entry);
            }

            final int datalen = DumpArchiveConstants.TP_SIZE * entry.getHeaderCount();

            if (blockBuffer.length < datalen) {
                blockBuffer = IOUtils.readRange(raw, datalen);
                if (blockBuffer.length != datalen) {
                    throw new EOFException();
                }
            } else if (raw.read(blockBuffer, 0, datalen) != datalen) {
                throw new EOFException();
            }

            int reclen = 0;

            for (int i = 0; i < datalen - 8 && i < size - 8; i += reclen) {
                final int ino = DumpArchiveUtil.convert32(blockBuffer, i);
                reclen = DumpArchiveUtil.convert16(blockBuffer, i + 4);
                if (reclen == 0) {
                    throw new DumpArchiveException("reclen cannot be 0");
                }

                final byte type = blockBuffer[i + 6];

                final String name = DumpArchiveUtil.decode(zipEncoding, blockBuffer, i + 8, blockBuffer[i + 7]);

                if (CURRENT_PATH_SEGMENT.equals(name) || PARENT_PATH_SEGMENT.equals(name)) {
                    // do nothing...
                    continue;
                }

                final Dirent d = new Dirent(ino, entry.getIno(), type, name);

                /*
                 * if ((type == 4) && names.containsKey(ino)) { System.out.println("we already have ino: " + names.get(ino)); }
                 */

                names.put(ino, d);

                // check whether this allows us to fill anything in the pending list.
                for (final Map.Entry<Integer, DumpArchiveEntry> mapEntry : pending.entrySet()) {
                    final DumpArchiveEntry v = mapEntry.getValue();
                    final String path = getPath(v);
                    if (path != null) {
                        v.setName(path);
                        v.setSimpleName(names.get(mapEntry.getKey()).getName());
                        queue.add(v);
                    }
                }

                // remove anything that we found. (We can't do it earlier
                // because of concurrent modification exceptions.)
                queue.forEach(e -> pending.remove(e.getIno()));
            }

            final byte[] peekBytes = raw.peek();

            if (!DumpArchiveUtil.verify(peekBytes)) {
                throw new InvalidFormatException();
            }

            entry = DumpArchiveEntry.parse(peekBytes);
            first = false;
            size -= DumpArchiveConstants.TP_SIZE;
        }
    }

}