mimesniff

#!/bin/sh
# mimesniff - sniff MIME type of content given on standard input
# based on MIME Sniffing Standard <http://mimesniff.spec.whatwg.org/>

# A resource header is the byte sequence at the beginning of a
# resource, as determined by reading the resource header.
#
# To read the resource header, perform the following steps:
#
# 1. Let buffer be a byte sequence.
#
# 2. Read bytes of the resource into buffer until one of the following
#    conditions is met:
#
#    - the end of the resource is reached.
#    - the number of bytes in buffer is greater than or equal to 512.

# Convert standard input to uppercase two-digit hex tuples, each
# followed by a single space and with no newlines ("AB" -> "41 42 ").
# od(1) is used because it is specified by POSIX, unlike hexdump(1);
# -v keeps runs of identical bytes from being collapsed into "*".
data_to_hex_tuples() (
  od -An -v -tx1 |
    awk '{ for (i = 1; i <= NF; i++) printf "%s ", toupper($i) }'
)
# Keep at most the first 512 bytes of standard input, converted by
# data_to_hex_tuples, as the resource header every signature is
# tested against.
resource_header=$(head -c 512 | data_to_hex_tuples)

# mask BYTES MASK
#
# Print the bitwise AND of two sequences of space-separated hex tuples
# as uppercase two-digit hex tuples separated by single spaces, e.g.
# `mask "3C 68" "FF DF"` prints "3C 48".  Both arguments are joined
# into one field list and field k is paired with field k + NF/2, so the
# two sequences are expected to have the same number of tuples.
mask() (
  printf '%s %s' "$1" "$2" |
    awk '{
      half = NF / 2
      for (k = 1; k <= half; k++)
        printf "0x%s 0x%s\n", $k, $(half + k)
    }' |
    {
      # Emit the separator in front of every tuple but the first, so
      # the result never carries a trailing space.
      sep=
      while read byte bits; do
        printf '%s%02X' "$sep" $(( $byte & $bits ))
        sep=' '
      done
    }
)

# match MIME_TYPE BYTE_PATTERN [PATTERN_MASK [LEADING_BYTES_IGNORED]]
#
# Test the start of the global $resource_header (hex tuples, each
# followed by a space) against BYTE_PATTERN.  On a match, print
# MIME_TYPE and exit the shell with status 0; otherwise return so the
# caller can try the next signature.
#
#   BYTE_PATTERN           space-separated hex tuples; the pseudo-tuple
#                          "TT" stands for a tag-terminating byte.
#   PATTERN_MASK           optional hex tuples ANDed with the header
#                          (via mask) before comparing; when empty or
#                          omitted the bytes are compared unchanged.
#   LEADING_BYTES_IGNORED  optional; "WS" or a space-separated list of
#                          hex tuples to skip at the start of the
#                          header.
match() {
  local mime_type="$1"
  local byte_pattern="$2"
  local pattern_mask="$3"
  local leading_bytes_ignored="$4"
  local ignored_tuples tuple skipping saved_header
  local byte_sequence masked_data

  if [ -n "$leading_bytes_ignored" ]; then

    # A whitespace byte (0xWS) is any of the following bytes: 0x09
    # (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP).

    if [ "$leading_bytes_ignored" = "WS" ]; then
      ignored_tuples="09 0A 0C 0D 20"
    else
      ignored_tuples="$leading_bytes_ignored"
    fi

    # Drop ignorable tuples from the front of the header with plain
    # shell pattern matching; this avoids sed's non-portable \| and \+
    # extensions.  The header is restored after the nested attempt.
    saved_header="$resource_header"
    skipping=yes
    while [ -n "$skipping" ]; do
      skipping=
      # Word splitting of the tuple list is intended here.
      for tuple in $ignored_tuples; do
        case "$resource_header" in
          "$tuple "*)
            resource_header="${resource_header#"$tuple "}"
            skipping=yes
            break
            ;;
        esac
      done
    done
    match "$mime_type" "$byte_pattern" "$pattern_mask"
    resource_header="$saved_header"
    return
  fi

  case "$byte_pattern" in
    *TT*)

      # A tag-terminating byte (0xTT) is any of the following bytes:
      # 0x20 (SP), 0x3E (">").

      match "$mime_type" \
        "$(printf '%s' "$byte_pattern" | sed -e 's/TT/20/g')" "$pattern_mask"
      match "$mime_type" \
        "$(printf '%s' "$byte_pattern" | sed -e 's/TT/3E/g')" "$pattern_mask"
      return
      ;;
  esac

  # Let the byte sequence to be matched be the resource header, cut to
  # the textual length of the pattern.  This is done only here, after
  # the recursive cases above, so they do not fork for nothing.

  byte_sequence="$(printf '%s' "$resource_header" | head -c "${#byte_pattern}")"

  # Let masked-data be the result of applying the bitwise AND operator
  # to sequence[s] and mask[m].
  if [ -n "$pattern_mask" ]; then
    masked_data="$(mask "$byte_sequence" "$pattern_mask")"
  else
    masked_data="$byte_sequence"
  fi

  # Let the byte pattern to be matched against be the value in the
  # first column of the current row.

  if [ "$byte_pattern" = "$masked_data" ]; then

    # If pattern-matched is true, the sniffed MIME type is the value
    # in the fourth column of the current row. Abort these steps.

    printf '%s' "$mime_type"
    exit 0
  fi
}

# match unknown MIME type

# Signatures are tried top to bottom; the first call to match that
# succeeds prints its MIME type and ends the script, so order matters.
# The third argument is the pattern mask (DF folds ASCII letters to
# upper case) and "WS" skips leading whitespace bytes.

match text/html "3C 21 44 4F 43 54 59 50 45 20 48 54 4D 4C TT" "FF FF DF DF DF DF DF DF DF FF DF DF DF DF FF" "WS" # <!DOCTYPE HTML
match text/html "3C 48 54 4D 4C TT" "FF DF DF DF DF FF" "WS" # <HTML
match text/html "3C 48 45 41 44 TT" "FF DF DF DF DF FF" "WS" # <HEAD
match text/html "3C 53 43 52 49 50 54 TT" "FF DF DF DF DF DF DF FF" "WS" # <SCRIPT
match text/html "3C 49 46 52 41 4D 45 TT" "FF DF DF DF DF DF DF FF" "WS" # <IFRAME
match text/html "3C 48 31 TT" "FF DF FF FF" "WS" # <H1
match text/html "3C 44 49 56 TT" "FF DF DF DF FF" "WS" # <DIV
match text/html "3C 46 4F 4E 54 TT" "FF DF DF DF DF FF" "WS" # <FONT
match text/html "3C 54 41 42 4C 45 TT" "FF DF DF DF DF DF FF" "WS" # <TABLE
match text/html "3C 41 TT" "FF DF FF" "WS" # <A
match text/html "3C 53 54 59 4C 45 TT" "FF DF DF DF DF DF FF" "WS" # <STYLE
match text/html "3C 54 49 54 4C 45 TT" "FF DF DF DF DF DF FF" "WS" # <TITLE
match text/html "3C 42 TT" "FF DF FF" "WS" # <B
match text/html "3C 42 4F 44 59 TT" "FF DF DF DF DF FF" "WS" # <BODY
match text/html "3C 42 52 TT" "FF DF DF FF" "WS" # <BR
match text/html "3C 50 TT" "FF DF FF" "WS" # <P
# Unlike the tags above, "<!--" is not followed by a tag-terminating
# byte: a comment may start directly with its text.
match text/html "3C 21 2D 2D" "" "WS" # <!--
match text/xml "3C 3F 78 6D 6C" "" "WS" # <?xml
match application/pdf "25 50 44 46 2D" # %PDF-

match application/postscript "25 21 50 53 2D 41 64 6F 62 65 2D" # %!PS-Adobe-
match text/plain "FE FF 00 00" "FF FF 00 00" # UTF-16BE BOM
match text/plain "FF FE 00 00" "FF FF 00 00" # UTF-16LE BOM
match text/plain "EF BB BF 00" "FF FF FF 00" # UTF-8 BOM

# Let matched-type be the result of executing the image type pattern
# matching algorithm with the resource header as the byte sequence to
# be matched.

# Image signatures, tried in the order listed.  Each row is
#   MIME type|byte pattern|pattern mask|note
# and an empty mask column makes match compare the bytes unmasked.
while IFS='|' read -r sig_type sig_pattern sig_mask _; do
  match "$sig_type" "$sig_pattern" "$sig_mask"
done <<'EOF'
image/x-icon|00 00 01 00||Windows Icon
image/x-icon|00 00 02 00||Windows Cursor
image/bmp|42 4D||BM
image/gif|47 49 46 38 37 61||GIF87a
image/gif|47 49 46 38 39 61||GIF89a
image/webp|52 49 46 46 00 00 00 00 57 45 42 50 56 50|FF FF FF FF 00 00 00 00 FF FF FF FF FF FF|
image/png|89 50 4E 47 0D 0A 1A 0A||
image/jpeg|FF D8 FF||
EOF

# Let matched-type be the result of executing the audio or video type
# pattern matching algorithm with the resource header as the byte
# sequence to be matched.

# Audio and video signatures, tried in the order listed.  Each row is
#   MIME type|byte pattern|pattern mask
# and an empty mask column makes match compare the bytes unmasked.
while IFS='|' read -r sig_type sig_pattern sig_mask; do
  match "$sig_type" "$sig_pattern" "$sig_mask"
done <<'EOF'
video/webm|1A 45 DF A3|
audio/basic|2E 73 6E 64|
audio/aiff|46 4F 52 4D 00 00 00 00 41 49 46 46|FF FF FF FF 00 00 00 00 FF FF FF FF
audio/mpeg|49 44 33|
application/ogg|4F 67 67 53 00|
audio/midi|4D 54 68 64 00 00 00 06|
video/avi|52 49 46 46 00 00 00 00 41 56 49 20|FF FF FF FF 00 00 00 00 FF FF FF FF
audio/wave|52 49 46 46 00 00 00 00 57 41 56 45|FF FF FF FF 00 00 00 00 FF FF FF FF
EOF

# Let matched-type be the result of executing the archive type pattern
# matching algorithm with the resource header as the byte sequence to
# be matched.

# Archive signatures, tried in the order listed; each row is
#   MIME type|byte pattern
# and every pattern is compared without a mask.
while IFS='|' read -r sig_type sig_pattern; do
  match "$sig_type" "$sig_pattern"
done <<'EOF'
application/x-gzip|1F 8B 08
application/zip|50 4B 03 04
application/x-rar-compressed|52 61 72 20 1A 07 00
EOF

# If the resource header contains no binary data bytes, the sniffed
# MIME type is "text/plain".

# A binary data byte is a byte in the range 0x00 to 0x08 (NUL to BS),
# the byte 0x0B (VT), a byte in the range 0x0E to 0x1A (SO to SUB), or
# a byte in the range 0x1C to 0x1F (FS to US).

# Print the fallback type for the global $resource_header:
# "application/octet-stream" if any of its hex tuples is a binary data
# byte, "text/plain" otherwise (including for an empty header).
sniff_text_or_binary() {
  # The header holds two-character tuples separated by spaces and the
  # expression contains no space, so it can only match a whole tuple:
  #   0[0-8BEF]    -> 00-08, 0B, 0E, 0F
  #   1[0-9ACDEF]  -> 10-1A, 1C-1F
  # An ERE is used because alternation is not defined for POSIX BREs,
  # and if/else keeps the two outcomes mutually exclusive.
  if printf '%s' "$resource_header" | grep -Eq '0[0-8BEF]|1[0-9ACDEF]'; then
    printf 'application/octet-stream'
  else
    printf 'text/plain'
  fi
}

sniff_text_or_binary