001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018/* 019 * The MIT License (MIT) 020 * Copyright (c) 2014 Martin Kleppmann 021 * 022 * Permission is hereby granted, free of charge, to any person obtaining a copy 023 * of this software and associated documentation files (the "Software"), to deal 024 * in the Software without restriction, including without limitation the rights 025 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 026 * copies of the Software, and to permit persons to whom the Software is 027 * furnished to do so, subject to the following conditions: 028 * 029 * The above copyright notice and this permission notice shall be included in 030 * all copies or substantial portions of the Software. 031 * 032 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 033 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 034 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 035 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 036 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 037 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 038 * THE SOFTWARE. 039 */ 040package org.apache.hadoop.hbase.test.util.warc; 041 042import java.io.ByteArrayOutputStream; 043import java.io.DataInput; 044import java.io.DataOutput; 045import java.io.IOException; 046import java.util.LinkedHashMap; 047import java.util.Map; 048import java.util.regex.Pattern; 049 050/** 051 * Immutable implementation of a record in a WARC file. You create a {@link WARCRecord} by parsing 052 * it out of a {@link DataInput} stream. 053 * <p/> 054 * The file format is documented in the 055 * <a href="http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf">ISO Standard</a>. In 056 * a nutshell, it's a textual format consisting of lines delimited by `\r\n`. Each record has the 057 * following structure: 058 * <ol> 059 * <li>A line indicating the WARC version number, such as `WARC/1.0`.</li> 060 * <li>Several header lines (in key-value format, similar to HTTP or email headers), giving 061 * information about the record. The header is terminated by an empty line. 062 * <li>A body consisting of raw bytes (the number of bytes is indicated in one of the headers). 063 * <li>A final separator of `\r\n\r\n` before the next record starts. 064 * </ol> 065 * There are various different types of records, as documented on {@link Header#getRecordType()}. 066 */ 067public class WARCRecord { 068 069 public static final String WARC_VERSION = "WARC/1.0"; 070 private static final Pattern VERSION_PATTERN = Pattern.compile("WARC/[0-9\\.]+"); 071 private static final Pattern CONTINUATION_PATTERN = Pattern.compile("^[\\t ]+.*"); 072 private static final String CRLF = "\r\n"; 073 private static final byte[] CRLF_BYTES = { 13, 10 }; 074 075 private final Header header; 076 private final byte[] content; 077 078 /** 079 * Creates a new WARCRecord by parsing it out of a {@link DataInput} stream. 080 * @param in The input source from which one record will be read. 081 */ 082 public WARCRecord(DataInput in) throws IOException { 083 header = readHeader(in); 084 content = new byte[header.getContentLength()]; 085 in.readFully(content); 086 readSeparator(in); 087 } 088 089 private static Header readHeader(DataInput in) throws IOException { 090 String versionLine = readLine(in); 091 if (!VERSION_PATTERN.matcher(versionLine).matches()) { 092 throw new IllegalStateException("Expected WARC version, but got: " + versionLine); 093 } 094 095 LinkedHashMap<String, String> headers = new LinkedHashMap<String, String>(); 096 String line, fieldName = null; 097 098 do { 099 line = readLine(in); 100 if (fieldName != null && CONTINUATION_PATTERN.matcher(line).matches()) { 101 headers.put(fieldName, headers.get(fieldName) + line); 102 } else if (!line.isEmpty()) { 103 String[] field = line.split(":", 2); 104 if (field.length < 2) { 105 throw new IllegalStateException("Malformed header line: " + line); 106 } 107 fieldName = field[0].trim(); 108 headers.put(fieldName, field[1].trim()); 109 } 110 } while (!line.isEmpty()); 111 112 return new Header(headers); 113 } 114 115 private static String readLine(DataInput in) throws IOException { 116 ByteArrayOutputStream out = new ByteArrayOutputStream(); 117 boolean seenCR = false, seenCRLF = false; 118 while (!seenCRLF) { 119 byte b = in.readByte(); 120 if (!seenCR && b == 13) { 121 seenCR = true; 122 } else if (seenCR && b == 10) { 123 seenCRLF = true; 124 } else { 125 seenCR = false; 126 out.write(b); 127 } 128 } 129 return out.toString("UTF-8"); 130 } 131 132 private static void readSeparator(DataInput in) throws IOException { 133 byte[] sep = new byte[4]; 134 in.readFully(sep); 135 if (sep[0] != 13 || sep[1] != 10 || sep[2] != 13 || sep[3] != 10) { 136 throw new IllegalStateException( 137 String.format("Expected final separator CR LF CR LF, but got: %d %d %d %d", sep[0], sep[1], 138 sep[2], sep[3])); 139 } 140 } 141 142 /** 143 * Returns the parsed header structure of the WARC record. 144 */ 145 public Header getHeader() { 146 return header; 147 } 148 149 /** 150 * Returns the body of the record, as an unparsed raw array of bytes. The content of the body 151 * depends on the type of record (see {@link Header#getRecordType()}). For example, in the case of 152 * a `response` type header, the body consists of the full HTTP response returned by the server 153 * (HTTP headers followed by the body). 154 */ 155 public byte[] getContent() { 156 return content; 157 } 158 159 /** 160 * Writes this record to a {@link DataOutput} stream. The output may, in some edge cases, be not 161 * byte-for-byte identical to what was parsed from a {@link DataInput}. However it has the same 162 * meaning and should not lose any information. 163 * @param out The output stream to which this record should be appended. 164 */ 165 public void write(DataOutput out) throws IOException { 166 header.write(out); 167 out.write(CRLF_BYTES); 168 out.write(content); 169 out.write(CRLF_BYTES); 170 out.write(CRLF_BYTES); 171 } 172 173 /** 174 * Returns a human-readable string representation of the record. 175 */ 176 @Override 177 public String toString() { 178 return header.toString(); 179 } 180 181 /** 182 * Contains the parsed headers of a {@link WARCRecord}. Each record contains a number of headers 183 * in key-value format, where some header keys are standardised, but nonstandard ones can be 184 * added. 185 * <p/> 186 * The documentation of the methods in this class is excerpted from the 187 * <a href="http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf">WARC 1.0 188 * specification</a>. Please see the specification for more detail. 189 */ 190 public final static class Header { 191 private final Map<String, String> fields; 192 193 private Header(Map<String, String> fields) { 194 this.fields = fields; 195 } 196 197 /** 198 * Returns the type of WARC record (the value of the `WARC-Type` header field). WARC 1.0 defines 199 * the following record types: (for full definitions, see the 200 * <a href="http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf">spec</a>. 201 * <ul> 202 * <li>`warcinfo`: Describes the records that follow it, up through end of file, end of input, 203 * or until next `warcinfo` record. Typically, this appears once and at the beginning of a WARC 204 * file. For a web archive, it often contains information about the web crawl which generated 205 * the following records. 206 * <p/> 207 * The format of this descriptive record block may vary, though the use of the 208 * `"application/warc-fields"` content-type is recommended. (...)</li> 209 * <li>`response`: The record should contain a complete scheme-specific response, including 210 * network protocol information where possible. For a target-URI of the `http` or `https` 211 * schemes, a `response` record block should contain the full HTTP response received over the 212 * network, including headers. That is, it contains the 'Response' message defined by section 6 213 * of HTTP/1.1 (RFC2616). 214 * <p/> 215 * The WARC record's Content-Type field should contain the value defined by HTTP/1.1, 216 * `"application/http;msgtype=response"`. The payload of the record is defined as its 217 * 'entity-body' (per RFC2616), with any transfer-encoding removed.</li> 218 * <li>`resource`: The record contains a resource, without full protocol response information. 219 * For example: a file directly retrieved from a locally accessible repository or the result of 220 * a networked retrieval where the protocol information has been discarded. For a target-URI of 221 * the `http` or `https` schemes, a `resource` record block shall contain the returned 222 * 'entity-body' (per RFC2616, with any transfer-encodings removed), possibly truncated.</li> 223 * <li>`request`: The record holds the details of a complete scheme-specific request, including 224 * network protocol information where possible. For a target-URI of the `http` or `https` 225 * schemes, a `request` record block should contain the full HTTP request sent over the network, 226 * including headers. That is, it contains the 'Request' message defined by section 5 of 227 * HTTP/1.1 (RFC2616). 228 * <p/> 229 * The WARC record's Content-Type field should contain the value defined by HTTP/1.1, 230 * `"application/http;msgtype=request"`. The payload of a `request` record with a target-URI of 231 * scheme `http` or `https` is defined as its 'entity-body' (per RFC2616), with any 232 * transfer-encoding removed.</li> 233 * <li>`metadata`: The record contains content created in order to further describe, explain, or 234 * accompany a harvested resource, in ways not covered by other record types. A `metadata` 235 * record will almost always refer to another record of another type, with that other record 236 * holding original harvested or transformed content. 237 * <p/> 238 * The format of the metadata record block may vary. The `"application/warc-fields"` format may 239 * be used.</li> 240 * <li>`revisit`: The record describes the revisitation of content already archived, and might 241 * include only an abbreviated content body which has to be interpreted relative to a previous 242 * record. Most typically, a `revisit` record is used instead of a `response` or `resource` 243 * record to indicate that the content visited was either a complete or substantial duplicate of 244 * material previously archived. 245 * <p/> 246 * A `revisit` record shall contain a WARC-Profile field which determines the interpretation of 247 * the record's fields and record block. Please see the specification for details.</li> 248 * <li>`conversion`: The record shall contain an alternative version of another record's content 249 * that was created as the result of an archival process. Typically, this is used to hold 250 * content transformations that maintain viability of content after widely available rendering 251 * tools for the originally stored format disappear. As needed, the original content may be 252 * migrated (transformed) to a more viable format in order to keep the information usable with 253 * current tools while minimizing loss of information.</li> 254 * <li>`continuation`: Record blocks from `continuation` records must be appended to 255 * corresponding prior record blocks (eg. from other WARC files) to create the logically 256 * complete full-sized original record. That is, `continuation` records are used when a record 257 * that would otherwise cause a WARC file size to exceed a desired limit is broken into 258 * segments. A continuation record shall contain the named fields `WARC-Segment-Origin-ID` and 259 * `WARC-Segment-Number`, and the last `continuation` record of a series shall contain a 260 * `WARC-Segment-Total-Length` field. Please see the specification for details.</li> 261 * <li>Other record types may be added in future, so this list is not exclusive.</li> 262 * </ul> 263 * @return The record's `WARC-Type` header field, as a string. 264 */ 265 public String getRecordType() { 266 return fields.get("WARC-Type"); 267 } 268 269 /** 270 * A 14-digit UTC timestamp formatted according to YYYY-MM-DDThh:mm:ssZ, described in the W3C 271 * profile of ISO8601. The timestamp shall represent the instant that data capture for record 272 * creation began. Multiple records written as part of a single capture event shall use the same 273 * WARC-Date, even though the times of their writing will not be exactly synchronized. 274 * @return The record's `WARC-Date` header field, as a string. 275 */ 276 public String getDateString() { 277 return fields.get("WARC-Date"); 278 } 279 280 /** 281 * An identifier assigned to the current record that is globally unique for its period of 282 * intended use. No identifier scheme is mandated by this specification, but each record-id 283 * shall be a legal URI and clearly indicate a documented and registered scheme to which it 284 * conforms (e.g., via a URI scheme prefix such as `http:` or `urn:`). 285 * @return The record's `WARC-Record-ID` header field, as a string. 286 */ 287 public String getRecordID() { 288 return fields.get("WARC-Record-ID"); 289 } 290 291 /** 292 * The MIME type (RFC2045) of the information contained in the record's block. For example, in 293 * HTTP request and response records, this would be `application/http` as per section 19.1 of 294 * RFC2616 (or `application/http; msgtype=request` and `application/http; msgtype=response` 295 * respectively). 296 * <p/> 297 * In particular, the content-type is *not* the value of the HTTP Content-Type header in an HTTP 298 * response, but a MIME type to describe the full archived HTTP message (hence 299 * `application/http` if the block contains request or response headers). 300 * @return The record's `Content-Type` header field, as a string. 301 */ 302 public String getContentType() { 303 return fields.get("Content-Type"); 304 } 305 306 /** 307 * The original URI whose capture gave rise to the information content in this record. In the 308 * context of web harvesting, this is the URI that was the target of a crawler's retrieval 309 * request. For a `revisit` record, it is the URI that was the target of a retrieval request. 310 * Indirectly, such as for a `metadata`, or `conversion` record, it is a copy of the 311 * `WARC-Target-URI` appearing in the original record to which the newer record pertains. The 312 * URI in this value shall be properly escaped according to RFC3986, and written with no 313 * internal whitespace. 314 * @return The record's `WARC-Target-URI` header field, as a string. 315 */ 316 public String getTargetURI() { 317 return fields.get("WARC-Target-URI"); 318 } 319 320 /** 321 * The number of bytes in the body of the record, similar to RFC2616. 322 * @return The record's `Content-Length` header field, parsed into an int. 323 */ 324 public int getContentLength() { 325 String lengthStr = fields.get("Content-Length"); 326 if (lengthStr == null) { 327 throw new IllegalStateException("Missing Content-Length header"); 328 } 329 try { 330 return Integer.parseInt(lengthStr); 331 } catch (NumberFormatException e) { 332 throw new IllegalStateException("Malformed Content-Length header: " + lengthStr); 333 } 334 } 335 336 /** 337 * Returns the value of a selected header field, or null if there is no header with that field 338 * name. 339 * @param field The name of the header to return (case-sensitive). 340 * @return The value associated with that field name, or null if not present. 341 */ 342 public String getField(String field) { 343 return fields.get(field); 344 } 345 346 /** 347 * Appends this header to a {@link DataOutput} stream, in WARC/1.0 format. 348 * @param out The data output to which the header should be written. 349 */ 350 public void write(DataOutput out) throws IOException { 351 out.write(toString().getBytes("UTF-8")); 352 } 353 354 /** 355 * Formats this header in WARC/1.0 format, consisting of a version line followed by 356 * colon-delimited key-value pairs, and `\r\n` line endings. 357 */ 358 @Override 359 public String toString() { 360 StringBuilder buf = new StringBuilder(); 361 buf.append(WARC_VERSION); 362 buf.append(CRLF); 363 for (Map.Entry<String, String> field : fields.entrySet()) { 364 buf.append(field.getKey()); 365 buf.append(": "); 366 buf.append(field.getValue()); 367 buf.append(CRLF); 368 } 369 return buf.toString(); 370 } 371 } 372 373}