I've started learning Rust by working through some of the Rosalind String Algorithm problems.
If anyone would like to point out possible improvements, or anything else, that would be great. There are a couple of things I'm especially curious about; they are noted in the comments of the subs and lcsm solutions.
main.rs:
mod utils; mod string_algorithms; fn main() { //string_algorithms::dna(); //string_algorithms::rna(); //string_algorithms::revc(); //string_algorithms::gc(); //string_algorithms::subs(); string_algorithms::lcsm(); println!("done!"); }
utils.rs:
use std::io::Read;
use std::io::Write;
use std::fs::File;
use std::collections::BTreeMap;
use std::char;

/// Reads the whole of `Input/rosalind_<abbr>.txt` into a `String`.
///
/// The path is relative to the current working directory.
///
/// # Panics
///
/// Panics (naming the path) if the file cannot be opened or read, which
/// includes the case where its contents are not valid UTF-8.
pub fn read_file(abbr: &str) -> String {
    let name = format!("Input/rosalind_{}.txt", abbr);
    let mut input = String::new();

    File::open(&name)
        .and_then(|mut file| file.read_to_string(&mut input))
        .unwrap_or_else(|err| panic!("failed to read {}: {}", name, err));

    input
}

/// Reads `Input/rosalind_<abbr>.txt` as FASTA-style records and returns a map
/// from record name to sequence.
///
/// Expected layout:
///
/// ```text
/// >NAME
/// STRBLAHBLAHBLAH
/// STILLTHESAMESTR
/// >NAME
/// ...
/// ```
///
/// Each record's name is the text between `>` and the first whitespace
/// character; everything after it, with all whitespace removed, is the
/// sequence (so sequences wrapped over several lines are joined).
///
/// Because the result is a `BTreeMap`, iteration is ordered by record name,
/// not by position in the file, and a repeated name keeps only the last
/// record carrying it.
///
/// # Panics
///
/// Panics under the same conditions as [`read_file`].
// Not every solution needs FASTA input, so this may be unused depending on
// which solution `main` currently calls.
#[allow(dead_code)]
pub fn read_fasta(abbr: &str) -> BTreeMap<String, String> {
    let input = read_file(abbr);
    let mut result = BTreeMap::new();

    // Whatever precedes the first '>' is not a record, hence `skip(1)`.
    for record in input.split('>').skip(1) {
        let mut tokens = record.split(char::is_whitespace);
        // `split` always yields at least one (possibly empty) item.
        let name = tokens.next().unwrap_or("").to_string();
        // Runs of whitespace produce empty tokens; concatenating drops them.
        let dna = tokens.collect::<String>();
        result.insert(name, dna);
    }

    result
}

/// Writes `output` to `Output/rosalind_<abbr>.txt`, replacing any existing
/// file of that name.
///
/// The path is relative to the current working directory; the `Output`
/// directory is not created here.
///
/// # Panics
///
/// Panics (naming the path) if the file cannot be created or written.
pub fn write_file(abbr: &str, output: &str) {
    let name = format!("Output/rosalind_{}.txt", abbr);

    // `write_all` rather than `write`: a single `write` call is allowed to
    // write only part of the buffer, which would silently truncate the result.
    File::create(&name)
        .and_then(|mut file| file.write_all(output.as_bytes()))
        .unwrap_or_else(|err| panic!("failed to write {}: {}", name, err));
}
string_algorithms.rs:
use utils; #[allow(dead_code)] pub fn dna() { // Read DNA string from file and count A, C, G, T characters. // Is there a way to do this without the Counter class? let abbr = "dna"; let input = utils::read_file(abbr); struct Counter { a: u32, c: u32, g: u32, t: u32, } impl Counter { fn new() -> Counter { Counter{ a: 0u32, c: 0u32, g: 0u32, t: 0u32, } } } let count = input.chars().fold(Counter::new(), |mut total, ch| { total.a += (ch == 'A') as u32; total.c += (ch == 'C') as u32; total.g += (ch == 'G') as u32; total.t += (ch == 'T') as u32; total }); let output = format!("{} {} {} {}", count.a, count.c, count.g, count.t); println!("{}", output); utils::write_file(abbr, &output); } #[allow(dead_code)] pub fn rna() { // Read DNA string from file and replace all T characters with U. // (Easy enough...) let abbr = "rna"; let input = utils::read_file(abbr); let output = input.replace("T", "U"); println!("{}", output); utils::write_file(abbr, &output); } #[allow(dead_code)] pub fn revc() { // Read DNA string from file, reverse it, then swap A with T and C with G. let abbr = "revc"; let input = utils::read_file(abbr); let output : String = input.chars().rev().map(|mut ch| { if ch == 'A' { ch = 'T'; } else if ch == 'T' { ch = 'A'; } else if ch == 'C' { ch = 'G'; } else if ch == 'G' { ch = 'C'; } ch }).collect(); println!("{}", output); utils::write_file(abbr, &output); } #[allow(dead_code)] pub fn gc() { // Read Name / DNA String pairs from file... // Find string with highest percentage of C and G. 
let abbr = "gc"; let input = utils::read_fasta(abbr); let mut max = ("", 0f32); for (k, v) in input.iter() { let gc_count = v.chars().filter(|&ch| ch == 'G' || ch == 'C').count(); let gc_percent = 100f32 * gc_count as f32 / v.len() as f32; if gc_percent > max.1 { max = (&k, gc_percent); } } let output = format!("{} {}", max.0, max.1); println!("{}", output); utils::write_file(abbr, &output); } use std::char; #[allow(dead_code)] pub fn subs() { // Read string and substring from file. // Count the number of occurrences of the substring in the whole string // (including overlapping matches!). let abbr = "subs"; let input = utils::read_file(abbr); // Extract whole and substring from file format: "wholestring\nsubstring" let mut iter = input.split(char::is_whitespace); let whole = iter.next().unwrap().to_string(); let sub = iter.collect::<String>(); // Why doesn't next.unwrap().to_string() work here too? assert!(!whole.is_empty()); assert!(!sub.is_empty()); assert!(whole.len() >= sub.len()); let mut positions = Vec::<usize>::new(); for i in 0..((whole.len() - sub.len()) + 1) { let m = whole.chars().skip(i) .zip(sub.chars()) .all(|(w, s)| w == s); if m { positions.push(i + 1); } } let output = positions.iter() .map(|p| p.to_string()) .collect::<Vec<_>>() .join(" "); println!("{}", output); utils::write_file(abbr, &output); } pub fn lcsm() { // Read Name / DNA string pairs from file. // Find the longest substring present in all the strings. // Work through all substrings of the first string (starting with the longest), // and check if it's present in all the other strings too. // (Could sort the strings so the shortest is first, // but they're all about the same length so it makes no difference...) let abbr = "lcsm"; let input = utils::read_fasta(abbr); let first: &String = input.values().next().unwrap(); for length in (1..(first.len() + 1)).rev() { let mut start = 0; loop { let end = start + length; // BAD: This copies the string... how to avoid this? 
let sub = first.chars().skip(start).take(length).collect::<String>(); if input.values().skip(1).all(|x| x.contains(&sub)) { println!("{}", sub); utils::write_file(abbr, &sub); return; } if end == first.len() { break; } start += 1; } } assert!(false); }