Rosalind string algorithm problems

Question

I've started learning Rust by working through some of the Rosalind String Algorithm problems.

If anyone would like to point out possible improvements, or anything else, that would be great. There are a couple of things I'm especially curious about noted in the subs and lcsm solutions.

main.rs:

mod utils; mod string_algorithms; fn main() { //string_algorithms::dna(); //string_algorithms::rna(); //string_algorithms::revc(); //string_algorithms::gc(); //string_algorithms::subs(); string_algorithms::lcsm(); println!("done!"); }

utils.rs:

use std::char;
use std::collections::BTreeMap;
use std::fs::File;
use std::io::Read;
use std::io::Write;

/// Reads the whole input file of the problem `abbr` into a `String`.
///
/// The file is looked up at `Input/rosalind_<abbr>.txt`, relative to the
/// current working directory.
///
/// # Panics
///
/// Panics, naming the path, if the file cannot be opened or if its contents
/// cannot be read as UTF-8 text.
pub fn read_file(abbr: &str) -> String {
    let name = format!("Input/rosalind_{}.txt", abbr);
    let mut input = String::new();
    File::open(&name)
        .unwrap_or_else(|err| panic!("cannot open {}: {}", name, err))
        .read_to_string(&mut input)
        .unwrap_or_else(|err| panic!("cannot read {}: {}", name, err));
    input
}

/// Reads the FASTA input file of the problem `abbr` into a name -> sequence map.
///
/// Expected format:
///
/// ```text
/// >NAME
/// STRBLAHBLAHBLAH
/// STILLTHESAMESTR
/// >NAME
/// ...
/// ```
///
/// Each record's name is the text between `>` and the first whitespace
/// character. All remaining whitespace-separated pieces of the record are
/// concatenated into its sequence, so the header line is assumed to hold
/// nothing but the name. A later record with the same name replaces an
/// earlier one.
///
/// # Panics
///
/// Panics under the same conditions as [`read_file`].
#[allow(dead_code)] // not every solution reads FASTA input
pub fn read_fasta(abbr: &str) -> BTreeMap<String, String> {
    let input = read_file(abbr);

    let mut result = BTreeMap::new();
    // Whatever precedes the first '>' belongs to no record, hence `skip(1)`.
    for record in input.split('>').skip(1) {
        let mut tokens = record.split(char::is_whitespace);
        // `split` always yields at least one (possibly empty) token.
        let name = tokens.next().unwrap_or("").to_string();
        // Joining the remaining tokens drops the line breaks inside the sequence.
        let dna: String = tokens.collect();
        result.insert(name, dna);
    }
    result
}

/// Writes `output` to the output file of the problem `abbr`.
///
/// The file is created (or truncated) at `Output/rosalind_<abbr>.txt`,
/// relative to the current working directory.
///
/// # Panics
///
/// Panics, naming the path, if the file cannot be created or written.
pub fn write_file(abbr: &str, output: &str) {
    let name = format!("Output/rosalind_{}.txt", abbr);
    File::create(&name)
        .unwrap_or_else(|err| panic!("cannot create {}: {}", name, err))
        // `write` is allowed to stop after a partial write; `write_all` keeps
        // going until every byte has been written.
        .write_all(output.as_bytes())
        .unwrap_or_else(|err| panic!("cannot write {}: {}", name, err));
}

string_algorithms.rs:

use utils; #[allow(dead_code)] pub fn dna() { // Read DNA string from file and count A, C, G, T characters. // Is there a way to do this without the Counter class? let abbr = "dna"; let input = utils::read_file(abbr); struct Counter { a: u32, c: u32, g: u32, t: u32, } impl Counter { fn new() -> Counter { Counter{ a: 0u32, c: 0u32, g: 0u32, t: 0u32, } } } let count = input.chars().fold(Counter::new(), |mut total, ch| { total.a += (ch == 'A') as u32; total.c += (ch == 'C') as u32; total.g += (ch == 'G') as u32; total.t += (ch == 'T') as u32; total }); let output = format!("{} {} {} {}", count.a, count.c, count.g, count.t); println!("{}", output); utils::write_file(abbr, &output); } #[allow(dead_code)] pub fn rna() { // Read DNA string from file and replace all T characters with U. // (Easy enough...) let abbr = "rna"; let input = utils::read_file(abbr); let output = input.replace("T", "U"); println!("{}", output); utils::write_file(abbr, &output); } #[allow(dead_code)] pub fn revc() { // Read DNA string from file, reverse it, then swap A with T and C with G. let abbr = "revc"; let input = utils::read_file(abbr); let output : String = input.chars().rev().map(|mut ch| { if ch == 'A' { ch = 'T'; } else if ch == 'T' { ch = 'A'; } else if ch == 'C' { ch = 'G'; } else if ch == 'G' { ch = 'C'; } ch }).collect(); println!("{}", output); utils::write_file(abbr, &output); } #[allow(dead_code)] pub fn gc() { // Read Name / DNA String pairs from file... // Find string with highest percentage of C and G. 
let abbr = "gc"; let input = utils::read_fasta(abbr); let mut max = ("", 0f32); for (k, v) in input.iter() { let gc_count = v.chars().filter(|&ch| ch == 'G' || ch == 'C').count(); let gc_percent = 100f32 * gc_count as f32 / v.len() as f32; if gc_percent > max.1 { max = (&k, gc_percent); } } let output = format!("{} {}", max.0, max.1); println!("{}", output); utils::write_file(abbr, &output); } use std::char; #[allow(dead_code)] pub fn subs() { // Read string and substring from file. // Count the number of occurrences of the substring in the whole string // (including overlapping matches!). let abbr = "subs"; let input = utils::read_file(abbr); // Extract whole and substring from file format: "wholestring\nsubstring" let mut iter = input.split(char::is_whitespace); let whole = iter.next().unwrap().to_string(); let sub = iter.collect::<String>(); // Why doesn't next.unwrap().to_string() work here too? assert!(!whole.is_empty()); assert!(!sub.is_empty()); assert!(whole.len() >= sub.len()); let mut positions = Vec::<usize>::new(); for i in 0..((whole.len() - sub.len()) + 1) { let m = whole.chars().skip(i) .zip(sub.chars()) .all(|(w, s)| w == s); if m { positions.push(i + 1); } } let output = positions.iter() .map(|p| p.to_string()) .collect::<Vec<_>>() .join(" "); println!("{}", output); utils::write_file(abbr, &output); } pub fn lcsm() { // Read Name / DNA string pairs from file. // Find the longest substring present in all the strings. // Work through all substrings of the first string (starting with the longest), // and check if it's present in all the other strings too. // (Could sort the strings so the shortest is first, // but they're all about the same length so it makes no difference...) let abbr = "lcsm"; let input = utils::read_fasta(abbr); let first: &String = input.values().next().unwrap(); for length in (1..(first.len() + 1)).rev() { let mut start = 0; loop { let end = start + length; // BAD: This copies the string... how to avoid this? 
let sub = first.chars().skip(start).take(length).collect::<String>(); if input.values().skip(1).all(|x| x.contains(&sub)) { println!("{}", sub); utils::write_file(abbr, &sub); return; } if end == first.len() { break; } start += 1; } } assert!(false); }

I have rolled back the last edit. Please see what you may and may not do after receiving answers. — Vogel612, CommentedOct 7, 2015 at 13:12
@Vogel612 My bad. I've added it as an answer instead, I hope that's ok. — user673679, CommentedOct 7, 2015 at 13:21

MAG · Accepted Answer · 2015-10-07 06:12:15Z

I am new to Rust as well, so this may not be the best alternative, but I have a small suggestion that I think is better than what you have.

In this particular part:

// Version from the question: walk the input back to front and swap each base
// for its complement by reassigning `ch` in an if/else chain.
let output : String = input.chars().rev().map(|mut ch| { if ch == 'A' { ch = 'T'; } else if ch == 'T' { ch = 'A'; } else if ch == 'C' { ch = 'G'; } else if ch == 'G' { ch = 'C'; } ch }).collect();

you could change it to this (taking advantage of the fact that almost everything in Rust is an expression):

// Walk the input back to front and map each base to its complement. `match`
// is an expression, so every arm directly yields the replacement character
// and anything that is not a base is passed through unchanged.
let output: String = input
    .chars()
    .rev()
    .map(|base| match base {
        'A' => 'T',
        'T' => 'A',
        'C' => 'G',
        'G' => 'C',
        other => other,
    })
    .collect();

Thanks, that's certainly an improvement. I've also changed the dna() function to use match when counting characters. — user673679, CommentedOct 7, 2015 at 13:06

user673679 · Accepted Answer · 2015-10-07 13:20:52Z

Using MAG's suggestion, I've also improved the "dna" solution like so:

// Count each nucleotide with four plain counters; any other character
// (such as a trailing newline) is ignored.
let (mut a, mut c, mut g, mut t) = (0u32, 0u32, 0u32, 0u32);
for ch in input.chars() {
    match ch {
        'A' => a += 1,
        'C' => c += 1,
        'G' => g += 1,
        // Must add 1 here: `t += t` would leave `t` stuck at 0 forever.
        'T' => t += 1,
        _ => (),
    };
}

Stack Exchange Network

Rosalind string algorithm problems

2 Answers 2

Hot Network Questions

Rosalind string algorithm problems

2 Answers 2

Related

Hot Network Questions