5
\$\begingroup\$

This is my first Rust project (I'm primarily a C guy). I want to know if my code is properly idiomatic (Rusty?). Also, are there any inefficiencies?

The code defines an iterator (Searcher) that takes a regex, a starting directory, and an optional maximum search depth. It generates locations of regex matches within the file system.

use std::fs; use std::io; use std::io::BufRead; use std::iter; use std::path; use std::vec::Vec; use regex; pub struct Location { pub file: path::PathBuf, pub line: usize, pub text: String, } struct FileScanner<'a> { path: path::PathBuf, pattern: &'a regex::Regex, lines: iter::Enumerate<io::Lines<io::BufReader<fs::File>>>, } fn is_line_printable(line: &str) -> bool { line.chars() .all(|c| c.is_ascii_graphic() || c.is_whitespace()) } impl<'a> FileScanner<'a> { fn build(path: path::PathBuf, pattern: &'a regex::Regex) -> Option<Self> { let handle = match fs::File::open(&path) { Ok(h) => h, Err(_) => return None, }; let reader = io::BufReader::new(handle); Some(FileScanner { path, pattern, lines: reader.lines().into_iter().enumerate(), }) } } impl<'a> Iterator for FileScanner<'a> { type Item = Location; fn next(&mut self) -> Option<Location> { loop { let (index, line) = match self.lines.next() { Some((i, Ok(l))) => (i, l), _ => return None, }; if !is_line_printable(&line) { return None; } let pattern_match = match self.pattern.find(&line) { Some(m) => m, None => continue, }; let start = pattern_match.start(); let end = pattern_match.end(); return Some(Location { file: self.path.to_path_buf(), line: index + 1, text: String::from(&line[start..end]), }); } } } pub struct Searcher<'a> { pattern: &'a regex::Regex, max_depth: Option<u8>, readers: Vec<fs::ReadDir>, current_scanner: Option<FileScanner<'a>>, } impl<'a> Searcher<'a> { pub fn build( pattern: &'a regex::Regex, directory: &'a path::Path, depth: Option<u8>, ) -> Result<Self, String> { match depth { Some(0) => return Err(String::from("Depth cannot be 0")), _ => (), }; let reader = match fs::read_dir(directory) { Ok(r) => r, Err(error) => return Err(error.to_string()), }; let readers = vec![reader]; Ok(Searcher { pattern, max_depth: depth, readers, current_scanner: None, }) } fn push_directory(&mut self, directory: path::PathBuf) { match self.max_depth { Some(depth) if usize::from(depth) == self.readers.len() => return, _ => (), } let reader = match fs::read_dir(directory) { Ok(r) => r, Err(_) => return, }; self.readers.push(reader); } fn next_match_from_file(&mut self) -> Option<Location> { let scanner = match &mut self.current_scanner { Some(s) => s, None => return None, }; let location = scanner.next(); if location.is_none() { self.current_scanner = None; } location } } impl<'a> Iterator for Searcher<'a> { type Item = Location; fn next(&mut self) -> Option<Location> { let location = self.next_match_from_file(); if location.is_some() { return location; } while self.readers.len() > 0 { let current_reader = self.readers.last_mut().unwrap(); let entry = match current_reader.next() { Some(Ok(ent)) => ent, Some(Err(_)) | None => { self.readers.pop(); continue; } }; let path = entry.path(); if path.is_dir() { self.push_directory(path); continue; } self.current_scanner = FileScanner::build(path, self.pattern); let location = self.next_match_from_file(); if location.is_some() { return location; } } None } } 
\$\endgroup\$

    1 Answer 1

    3
    \$\begingroup\$

    You did a good job, your code already leverages on main rust features, but there are some suggestions to make your code more idiomatic and efficient:

    Idiomatic improvements

    1. Use ? for error propagation

    Instead of manual match expressions:

    // Current approach let reader = match fs::read_dir(directory) { Ok(r) => r, Err(error) => return Err(error.to_string()), }; // More idiomatic let reader = fs::read_dir(directory).map_err(|e| e.to_string())?; 

    2. Use if let for simpler pattern matching (it's cleaner)

    // Instead of match depth { Some(0) => return Err(String::from("Depth cannot be 0")), _ => (), }; // More idiomatic if let Some(0) = depth { return Err("Depth cannot be 0".to_string()); } 

    3. Avoid unnecessary empty match arms

    // Instead of match self.max_depth { Some(depth) if usize::from(depth) == self.readers.len() => return, _ => (), } // Better if let Some(depth) = self.max_depth { if usize::from(depth) == self.readers.len() { return; } } 

    Performance considerations

    1. Binary file detection: Your is_line_printable function reads every character of every line. Consider checking a sample of bytes at the beginning of the file instead.

    2. Path cloning: You're creating a new PathBuf for each match with self.path.to_path_buf(). This is sometimes unavoidable (unfortunately) but worth noting.

    3. File filtering: Add extension filtering to avoid scanning binary files altogether.

    4. Memory usage: The current approach holds file handles open until matches are found, which is generally fine but could be improved.

    Here's an improved version of your code:

    use std::fs; use std::io::{self, BufRead}; use std::iter; use std::path::{Path, PathBuf}; use std::vec::Vec; use regex::Regex; pub struct Location { pub file: PathBuf, pub line: usize, pub line_content: String, // store whole line for context pub match_text: String, // store matched text separately pub match_span: (usize, usize), // start and end positions } struct FileScanner<'a> { path: PathBuf, pattern: &'a Regex, lines: iter::Enumerate<io::Lines<io::BufReader<fs::File>>>, } // improved binary detection - (check first few bytes) fn is_likely_binary(path: &Path) -> io::Result<bool> { let file = fs::File::open(path)?; let mut buffer = [0; 512]; // sample first 512 bytes let bytes_read = file.take(512).read(&mut buffer)?; // check for null bytes or high concentration of non-ASCII chars let non_text_chars = buffer[..bytes_read] .iter() .filter(|&&b| b == 0 || b > 127) .count(); Ok(non_text_chars > bytes_read / 4) // > 25% non-text chars indicates binary } impl<'a> FileScanner<'a> { fn new(path: PathBuf, pattern: &'a Regex) -> io::Result<Self> { let handle = fs::File::open(&path)?; let reader = io::BufReader::new(handle); Ok(FileScanner { path, pattern, lines: reader.lines().enumerate(), }) } fn build(path: PathBuf, pattern: &'a Regex) -> Option<Self> { if is_likely_binary(&path).unwrap_or(true) { return None; // skip } Self::new(path).ok() } } impl<'a> Iterator for FileScanner<'a> { type Item = Location; fn next(&mut self) -> Option<Location> { while let Some((index, line_result)) = self.lines.next() { let line = match line_result { Ok(l) => l, Err(_) => continue, // skip lines with IO errors instead of aborting }; if let Some(pattern_match) = self.pattern.find(&line) { let start = pattern_match.start(); let end = pattern_match.end(); return Some(Location { file: self.path.clone(), // clone is necessary here line: index + 1, line_content: line.clone(), match_text: line[start..end].to_string(), match_span: (start, end), }); } } None } } pub struct Searcher<'a> { pattern: &'a Regex, max_depth: Option<u8>, readers: Vec<fs::ReadDir>, current_scanner: Option<FileScanner<'a>>, allowed_extensions: Option<Vec<String>>, // add extension-based filtering } impl<'a> Searcher<'a> { pub fn new( pattern: &'a Regex, directory: &Path, depth: Option<u8>, extensions: Option<Vec<String>>, ) -> Result<Self, String> { if let Some(0) = depth { return Err("Depth cannot be 0".to_string()); } let reader = fs::read_dir(directory).map_err(|e| e.to_string())?; Ok(Searcher { pattern, max_depth: depth, readers: vec![reader], current_scanner: None, allowed_extensions: extensions, }) } pub fn build( pattern: &'a Regex, directory: &Path, depth: Option<u8>, ) -> Result<Self, String> { Self::new(pattern, directory, depth, None) } fn should_scan_file(&self, path: &Path) -> bool { if let Some(ref extensions) = self.allowed_extensions { if let Some(ext) = path.extension().and_then(|e| e.to_str()) { return extensions.iter().any(|e| e == ext); } return false; // no extension } true // no extension filtering } fn push_directory(&mut self, directory: PathBuf) { // early return if we've reached max depth if let Some(depth) = self.max_depth { if usize::from(depth) == self.readers.len() { return; } } if let Ok(reader) = fs::read_dir(directory) { self.readers.push(reader); } } fn next_match_from_file(&mut self) -> Option<Location> { let location = self.current_scanner.as_mut()?.next(); if location.is_none() { self.current_scanner = None; } location } } impl<'a> Iterator for Searcher<'a> { type Item = Location; fn next(&mut self) -> Option<Location> { if let Some(location) = self.next_match_from_file() { return Some(location); } while let Some(reader) = self.readers.last_mut() { // here we search through all directories match reader.next() { Some(Ok(entry)) => { let path = entry.path(); if path.is_dir() { self.push_directory(path); } else if self.should_scan_file(&path) { self.current_scanner = FileScanner::build(path, self.pattern); if let Some(location) = self.next_match_from_file() { return Some(location); } } } _ => { // if it reaches here it means there's an error self.readers.pop(); } } } None } } 
    \$\endgroup\$

      Start asking to get answers

      Find the answer to your question by asking.

      Ask question

      Explore related questions

      See similar questions with these tags.