use std::io::Read;
use std::{
fs,
io::ErrorKind::*,
path::{Path, PathBuf},
};
use ignore::WalkBuilder;
use rustc_hash::FxHashSet;
use crate::exit_codes::EXIT_BAD_ARGUMENTS;
use crate::options::FileArgument;
/// Read the full contents of `path`, or report the failure and exit.
///
/// On any read error a description is printed to stderr and the process
/// terminates with `EXIT_BAD_ARGUMENTS`; this function only returns on
/// success.
pub(crate) fn read_file_or_die(path: &FileArgument) -> Vec<u8> {
    read_file_arg(path).unwrap_or_else(|err| {
        eprint_read_error(path, &err);
        std::process::exit(EXIT_BAD_ARGUMENTS)
    })
}
/// Read both sides of a comparison, or report the failure(s) and exit.
///
/// Returns `(lhs_bytes, rhs_bytes)`. When `missing_as_empty` is set and
/// exactly one side fails with `NotFound` while the other side reads
/// successfully, the missing side is returned as an empty buffer. In every
/// other failure case each error is printed to stderr and the process
/// terminates with `EXIT_BAD_ARGUMENTS`.
pub(crate) fn read_files_or_die(
    lhs_path: &FileArgument,
    rhs_path: &FileArgument,
    missing_as_empty: bool,
) -> (Vec<u8>, Vec<u8>) {
    // A read error is tolerated only when the caller opted in and the file
    // simply does not exist.
    let may_treat_as_empty = |err: &std::io::Error| missing_as_empty && err.kind() == NotFound;

    // Both sides are always attempted (left first) so that both failures can
    // be reported together.
    let lhs_outcome = read_file_arg(lhs_path);
    let rhs_outcome = read_file_arg(rhs_path);

    match (lhs_outcome, rhs_outcome) {
        (Ok(lhs_bytes), Ok(rhs_bytes)) => (lhs_bytes, rhs_bytes),
        (Ok(lhs_bytes), Err(err)) if may_treat_as_empty(&err) => (lhs_bytes, Vec::new()),
        (Err(err), Ok(rhs_bytes)) if may_treat_as_empty(&err) => (Vec::new(), rhs_bytes),
        (lhs_outcome, rhs_outcome) => {
            let attempts = [(lhs_path, &lhs_outcome), (rhs_path, &rhs_outcome)];
            for (file_arg, outcome) in attempts {
                if let Err(err) = outcome {
                    eprint_read_error(file_arg, err);
                }
            }
            std::process::exit(EXIT_BAD_ARGUMENTS);
        }
    }
}
/// Read every byte from the source described by `file_arg`.
///
/// * `NamedPath` reads the file at that path.
/// * `Stdin` reads standard input until end of input.
/// * `DevNull` yields an empty buffer without touching the filesystem.
///
/// # Errors
///
/// Returns the underlying I/O error if the file or stdin cannot be read.
fn read_file_arg(file_arg: &FileArgument) -> std::io::Result<Vec<u8>> {
    match file_arg {
        FileArgument::NamedPath(path) => fs::read(path),
        FileArgument::Stdin => {
            let mut contents = Vec::new();
            std::io::stdin().lock().read_to_end(&mut contents)?;
            Ok(contents)
        }
        FileArgument::DevNull => Ok(Vec::new()),
    }
}
/// Print a human-readable explanation of why reading `file_arg` failed.
///
/// `e` is the error returned by the failed read. The message is written to
/// stderr; nothing is returned and the process is not terminated here.
///
/// A named path that turns out to be a directory is reported as such before
/// the error kind is inspected. The kind produced by reading a directory is
/// platform dependent (for example, Windows refuses to open a directory as a
/// file with an access-denied error, which surfaces as `PermissionDenied`),
/// so dispatching on the kind first would produce a misleading message.
fn eprint_read_error(file_arg: &FileArgument, e: &std::io::Error) {
    if let FileArgument::NamedPath(path) = file_arg {
        if path.is_dir() {
            eprintln!("Expected a file, got a directory: {}", path.display());
            return;
        }
    }

    match e.kind() {
        std::io::ErrorKind::NotFound => {
            eprintln!("No such file: {}", file_arg);
        }
        std::io::ErrorKind::PermissionDenied => {
            eprintln!("Permission denied when reading file: {}", file_arg);
        }
        kind => eprintln!("Could not read file: {} (error {:?})", file_arg, kind),
    }
}
/// Read the file at `path`, or report the failure and exit.
///
/// On error the problem is described on stderr (using the same messages as
/// for a named-path file argument) and the process terminates with
/// `EXIT_BAD_ARGUMENTS`.
pub(crate) fn read_or_die(path: &Path) -> Vec<u8> {
    fs::read(path).unwrap_or_else(|err| {
        let file_arg = FileArgument::NamedPath(path.to_path_buf());
        eprint_read_error(&file_arg, &err);
        std::process::exit(EXIT_BAD_ARGUMENTS)
    })
}
/// Does `bytes` begin with a UTF-16 byte order mark?
///
/// Both byte orders are recognised: `FE FF` and `FF FE`.
fn has_utf16_byte_order_mark(bytes: &[u8]) -> bool {
    bytes.starts_with(&[0xfe, 0xff]) || bytes.starts_with(&[0xff, 0xfe])
}
/// Reinterpret `bytes` as a sequence of 16-bit code units.
///
/// The byte order is big endian only when the input starts with the
/// `FE FF` byte order mark; in every other case (including no mark at all)
/// little endian is used. The mark itself is decoded like any other unit,
/// and a trailing odd byte is dropped.
fn u16_from_bytes(bytes: &[u8]) -> Vec<u16> {
    // Choose the decoder once rather than branching for every pair.
    let decode: fn([u8; 2]) -> u16 = if bytes.starts_with(&[0xfe, 0xff]) {
        u16::from_be_bytes
    } else {
        u16::from_le_bytes
    };

    bytes
        .chunks_exact(2)
        .map(|pair| decode([pair[0], pair[1]]))
        .collect()
}
/// The outcome of sniffing a file's raw bytes: either decoded text or binary.
#[derive(Debug, Eq, PartialEq)]
pub(crate) enum ProbableFileKind {
/// The bytes were decoded as text; holds the decoded string.
Text(String),
/// The bytes were not recognised as text.
Binary,
}
/// Guess whether `bytes` hold text or binary data.
///
/// The checks run in this order, and the first one that matches wins:
///
/// 1. Entirely valid UTF-8 is text.
/// 2. A MIME type (sniffed from at most the first 1000 bytes) that is PDF,
///    gzip, zip, or any image/audio/video/font type is binary.
/// 3. Valid UTF-16 that starts with a byte order mark is text.
/// 4. UTF-8 with at most 10 replacement or NUL characters among the first
///    5000 decoded characters is text (decoded lossily).
/// 5. UTF-16 with at most 5 such characters among the first 5000 decoded
///    characters is text (decoded lossily).
/// 6. Anything else is binary.
pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind {
    if let Ok(valid_utf8_string) = std::str::from_utf8(bytes) {
        return ProbableFileKind::Text(valid_utf8_string.to_owned());
    }

    // Only a bounded prefix is handed to the MIME sniffer.
    let magic_bytes = &bytes[..bytes.len().min(1000)];
    let mime = tree_magic_mini::from_u8(magic_bytes);
    info!("MIME type detected: {}", mime);

    let binary_mime_prefixes = ["image/", "audio/", "video/", "font/"];
    let is_binary_mime = matches!(
        mime,
        "application/pdf" | "application/gzip" | "application/zip"
    ) || binary_mime_prefixes
        .iter()
        .any(|prefix| mime.starts_with(*prefix));
    if is_binary_mime {
        return ProbableFileKind::Binary;
    }

    // Count characters that suggest a failed decode, looking only at the
    // start of the decoded text.
    let count_suspect_chars = |text: &str| {
        text.chars()
            .take(5000)
            .filter(|c| [std::char::REPLACEMENT_CHARACTER, '\0'].contains(c))
            .count()
    };

    let u16_values = u16_from_bytes(bytes);

    // Strictly valid UTF-16 is only trusted when a byte order mark is present.
    if has_utf16_byte_order_mark(bytes) {
        if let Ok(valid_utf16_string) = String::from_utf16(&u16_values) {
            return ProbableFileKind::Text(valid_utf16_string);
        }
    }

    let utf8_string = String::from_utf8_lossy(bytes).to_string();
    let num_utf8_invalid = count_suspect_chars(&utf8_string);
    if num_utf8_invalid <= 10 {
        info!(
            "Input file is mostly valid UTF-8 (invalid characters: {})",
            num_utf8_invalid
        );
        return ProbableFileKind::Text(utf8_string);
    }

    let utf16_string = String::from_utf16_lossy(&u16_values);
    let num_utf16_invalid = count_suspect_chars(&utf16_string);
    if num_utf16_invalid <= 5 {
        info!(
            "Input file is mostly valid UTF-16 (invalid characters: {})",
            num_utf16_invalid
        );
        return ProbableFileKind::Text(utf16_string);
    }

    ProbableFileKind::Binary
}
/// List every non-directory path found by walking `dir`, relative to `dir`.
///
/// Hidden entries are included in the walk. Entries the walker fails to
/// yield are silently skipped.
///
/// # Panics
///
/// Panics if a walked path does not start with `dir`.
fn relative_file_paths_in_dir(dir: &Path) -> Vec<PathBuf> {
    let walker = WalkBuilder::new(dir).hidden(false).build();

    let mut relative_paths = Vec::new();
    for entry in walker.filter_map(Result::ok) {
        let full_path = entry.path();
        if full_path.is_dir() {
            continue;
        }
        relative_paths.push(full_path.strip_prefix(dir).unwrap().to_path_buf());
    }
    relative_paths
}
pub(crate) fn relative_paths_in_either(lhs_dir: &Path, rhs_dir: &Path) -> Vec<PathBuf> {
let lhs_paths = relative_file_paths_in_dir(lhs_dir);
let rhs_paths = relative_file_paths_in_dir(rhs_dir);
let mut seen = FxHashSet::default();
let mut paths: Vec<PathBuf> = vec![];
let mut i = 0;
let mut j = 0;
loop {
match (lhs_paths.get(i), rhs_paths.get(j)) {
(Some(lhs_path), Some(rhs_path)) if lhs_path == rhs_path => {
if !seen.contains(lhs_path) {
paths.push(lhs_path.clone());
seen.insert(lhs_path);
}
i += 1;
j += 1;
}
(Some(lhs_path), Some(rhs_path)) => {
if seen.contains(lhs_path) {
i += 1;
} else if seen.contains(rhs_path) {
j += 1;
} else {
paths.push(lhs_path.clone());
paths.push(rhs_path.clone());
seen.insert(lhs_path);
seen.insert(rhs_path);
i += 1;
j += 1;
}
}
_ => break,
}
}
paths.extend(
lhs_paths[i..]
.iter()
.filter(|&path| !seen.contains(path))
.cloned(),
);
paths.extend(
rhs_paths[j..]
.iter()
.filter(|&path| !seen.contains(path))
.cloned(),
);
paths
}
// Unit tests for `guess_content`, using small in-memory byte fixtures.
#[cfg(test)]
mod tests {
use super::*;
// Plain ASCII is valid UTF-8, so it must be classified as text.
#[test]
fn test_plaintext_is_text() {
let s = "hello world";
assert!(matches!(
guess_content(s.as_bytes()),
ProbableFileKind::Text(_)
));
}
// NOTE(review): this test is marked #[ignore]; the reason is not visible
// in this file. Confirm whether it is still expected to fail.
#[test]
#[ignore]
fn test_gzip_is_binary() {
// The fixture starts with 0x1f 0x8b, the gzip magic number.
let bytes = vec![
0x1f, 0x8b, 0x08, 0x00, 0x3a, 0xb0, 0x91, 0x63, 0x00, 0x03, 0x8b, 0x8e, 0xe5, 0x02,
0x00, 0x44, 0xd2, 0x68, 0x70, 0x03, 0x00, 0x00, 0x00,
];
assert_eq!(guess_content(&bytes), ProbableFileKind::Binary);
}
// Presumably bytes taken from a DEX file, going by the test name; the
// fixture contains many NUL bytes and must be classified as binary.
#[test]
fn test_dex_is_binary() {
let bytes = vec![
0x34, 0x8a, 0x4b, 0x8f, 0x77, 0xa4, 0x4e, 0xb1, 0x31, 0x2d, 0x5f, 0xfb, 0x10, 0x08,
0xa8, 0x6b, 0x58, 0x06, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x78, 0x56, 0x34, 0x12,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xac, 0x05, 0x00, 0x00, 0x23, 0x00,
0x00, 0x00, 0x70, 0x00, 0x00, 0x00,
];
assert_eq!(guess_content(&bytes), ProbableFileKind::Binary);
}
// The fixture begins with the PNG signature (0x89 'P' 'N' 'G' CR LF 0x1a LF)
// and must be classified as binary.
#[test]
fn test_png_bytes_are_binary() {
let bytes = vec![
0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 0x00, 0x00, 0x00, 0x0d, 0x49, 0x48,
0x44, 0x52, 0x00, 0x00, 0x03, 0x76, 0x00, 0x00, 0x01, 0xed, 0x08, 0x06, 0x00, 0x00,
0x01, 0x22, 0x94, 0xdc, 0xb2, 0x00, 0x00, 0x1b, 0x5a, 0x7a, 0x54, 0x58, 0x74, 0x52,
0x61, 0x77, 0x20, 0x70, 0x72, 0x6f, 0x66, 0x69, 0x6c, 0x65, 0x20, 0x74, 0x79, 0x70,
0x65, 0x20, 0x65, 0x78, 0x69, 0x66, 0x00, 0x00, 0x78, 0xda, 0xad, 0x9b, 0x69, 0x76,
0x1c, 0xb7, 0x92, 0x85, 0xff, 0x63, 0x15, 0xbd, 0x04, 0x4c, 0x81, 0x61, 0x39, 0x18,
0xcf, 0x79, 0x3b, 0xe8, 0xe5, 0xf7, 0x77, 0xb3, 0x48, 0x8a, 0x92, 0xa8, 0xc1, 0xcf,
0x2d, 0xda, 0x2a, 0xba, 0x2a, 0x2b, 0x13, 0x19, 0xc3, 0x1d, 0x02, 0x69, 0x77, 0xfe,
0xf7, 0x3f, 0xd7, 0xfd, 0x0f, 0x7f, 0x4a, 0x6f, 0xc5, 0x65, 0xab, 0xad, 0xf4, 0x52,
0x3c, 0x7f, 0x72, 0xcf, 0x3d, 0x0e, 0x7e, 0x69, 0xfe, 0xdb, 0x9f, 0xf3, 0xf6, 0xda,
0xbd, 0xbf, 0xfa, 0x3d, 0xf3, 0x6f, 0x78, 0x7b, 0x2f, 0x7c, 0x1c, 0x15, 0xdc, 0x77,
0x1f, 0xbc, 0xff, 0x16, 0xf8, 0xf9, 0xf4, 0x7e, 0x7b, 0x3f, 0x59, 0xe4, 0x9d, 0xc4,
0x6b, 0x7a, 0x7b, 0xff, 0xfd, 0x5b, 0x49, 0x27, 0x4a, 0x9f, 0x4e, 0x54, 0x3e, 0x5e,
0xc3, 0x57, 0xef, 0x07, 0xfb, 0xe1, 0xfd, 0xf4, 0x71, 0xf9, 0xf8, 0xdd, 0x8a, 0x5a,
0xf9, 0xb8, 0x72, 0xfc, 0xfc, 0xfe, 0x3c, 0x61, 0xf8, 0xcf, 0x7f, 0xda, 0xb7, 0x7f,
0xef, 0xdd, 0xed, 0x72, 0xcf, 0x4e, 0x47, 0xe7, 0x91, 0x0b, 0xf1, 0x29, 0x6f, 0x37,
0xf5, 0x7e, 0x8b, 0xcf, 0x6f, 0x1c, 0x38, 0x09, 0x4c, 0x7a, 0xbe, 0x56, 0xf8, 0xa9,
0xfc, 0x6b, 0xfc, 0x5e, 0x9f, 0x9f, 0xce, 0x4f, 0xf3, 0xc3, 0x2f, 0x17, 0xb2, 0xdf,
0x7e, 0xf9, 0xc9, 0xcf, 0x0a, 0x3d, 0xc4, 0x90, 0xfc, 0x0d, 0x39, 0xec, 0x30, 0xc2,
0x0d, 0xe7, 0x79, 0x5d, 0x61, 0xb1, 0xc4, 0x1c, 0x4f, 0xac, 0xbc, 0xc6, 0xb8, 0x62,
0x7a, 0xde, 0x6b, 0xa9, 0xc6, 0x1e, 0x57, 0xf2, 0x29, 0xa4, 0xec, 0x52, 0x4e, 0x39,
0xdc, 0x58, 0x53, 0x4f, 0x3b, 0xb5, 0x14, 0xd3, 0x8a, 0x27, 0x25, 0xde, 0x8d, 0x1f,
0x6b, 0x09, 0xcf, 0x75, 0xfb, 0x73, 0xb9, 0x15, 0x1a, 0x17, 0xde, 0x81, 0x23, 0x63,
0xe0, 0x64, 0x81, 0x6f, 0x3c, 0x3f, 0xee, 0xfd, 0x97, 0x7f, 0xfb, 0xf3, 0xe5, 0x89,
0xee, 0x5d, 0x0a, 0x51, 0x50, 0x30, 0x49, 0x7d, 0x78, 0x25, 0x38, 0x46, 0xa5, 0x21,
0x28, 0x8a, 0x49, 0x7f, 0x73, 0x14, 0x09, 0x09, 0xf7, 0xbd, 0x8e, 0xec, 0x09, 0xf0,
0xfb, 0xcf, 0x8f, 0x7f, 0x48, 0x2c, 0x27, 0xc9, 0x1c, 0xa5, 0x30, 0x37, 0x6e, 0x70,
0xf8, 0xf9, 0x3a, 0xc5, 0xb4, 0xf0, 0xad, 0xb6, 0x92, 0x7b, 0x12, 0x9d, 0x38, 0xd0,
0x78, 0xcd, 0xaf, 0x2f, 0xd7, 0xfd, 0x76, 0x02, 0x42, 0xc4, 0xb5, 0x8d, 0xc5, 0x84,
0x44, 0x06, 0x7c, 0x09, 0xc9, 0x42, 0x09, 0xbe, 0xc6, 0x58, 0x43, 0x20, 0x8e, 0x8d,
0xfc, 0x0c, 0x56, 0x1e, 0x93, 0xcb, 0x71, 0x92, 0x82, 0x60, 0x16, 0x37,
];
assert_eq!(guess_content(&bytes), ProbableFileKind::Binary);
}
}