main.rs 6.37 KB
mod earley_parse;
mod cfg;
mod types;
mod nfa_array;
mod boyer_moore;
mod threadpool;

use crate::cfg::parse_regex;
use crate::nfa_array::{Simulation, create_simulation};
use crate::boyer_moore::BM;
use crate::threadpool::Threadpool;

use std::env;
use std::fs::File;
use std::io::{self, BufRead};
use std::path::Path;
use std::str::FromStr;
use std::cmp;
use std::sync::{Arc, Mutex};

type Match = (usize, Vec<String>);

fn main() {
    let args: Vec<String> = env::args().collect();
    let regex = &args[1];
    let filename = &args[2];
    let thread_n: usize = FromStr::from_str(&args[3]).unwrap();
    let result= parse_regex(regex);

    match result {
        | None => println!("The string provided is not a regular expression"),
        | Some(ast) => 
            {
                let new_ast = ast.collapse();
                let mut sim = create_simulation(&new_ast);
                let prefix = sim.enable_prefix();
                let has_prefix = prefix != "";
                // let mut line_n = 0;
                let all_matches: Arc<Mutex<Vec<Match>>> = Arc::new(Mutex::new(Vec::new()));

                if let Ok(lines) = read_lines(filename) {
                    let pool = Threadpool::new(thread_n);
                    let mut bm;
                    if has_prefix {
                        bm = BM::new(prefix);
                    }
                    else {
                        bm = BM::new(String::from(""));
                    }
                    for (line_n, line) in lines.flatten().enumerate() {
                        let mut sim_thread = sim.clone();
                        let line_vec: Vec<char> = line.chars().collect();
                        let all_matches = Arc::clone(&all_matches);
                        if has_prefix {
                            let mut bm_thread = bm.clone();
                            pool.execute(move || {
                                let results = check_line_with_prefix(&mut sim_thread, &line_vec, line_n, &mut bm_thread);
                                match results {
                                    None => (),
                                    Some(m) => {
                                        let mut matches_arr = all_matches.lock().unwrap();
                                        matches_arr.push((line_n, m));
                                    }
                                }
                            });
                        }
                        else {
                            pool.execute(move || {
                                let results = check_line(&mut sim_thread, &line_vec, line_n);
                                match results {
                                    None => (),
                                    Some(m) => {
                                        let mut matches_arr = all_matches.lock().unwrap();
                                        matches_arr.push((line_n, m));
                                    }
                                }
                            })
                        }
                    }
                }
                all_matches.lock().unwrap().sort_by(|a, b| compare_match(a, b));
                for (line_n, line_matches) in &*all_matches.lock().unwrap() {
                    for m in line_matches {
                        println!("{line_n}:{m}");
                    }
                }
            },
    };
}

fn check_to_end_prefix(sim: &mut Simulation, lm: &mut usize, 
    i: usize, line: &Vec<char>, line_len: usize) {
    // finds the largest string from i to line_len that matches
    let mut j = i;
    while !sim.stuck() & (j < line_len) {
        let c = line[j];
        sim.step(c);
        if sim.accepts() {
            *lm = j;
        }
        j += 1;
    }
}

fn check_to_end(sim: &mut Simulation, lm: &mut usize, 
    i: usize, line: &Vec<char>, line_len: usize) {
    // finds the largest string from i to line_len that matches
    let mut j = i;
    while !sim.stuck() & (j < line_len) {
        let c = line[j];
        sim.step(c);
        if sim.accepts() {
            *lm = j + 1;
        }
        j += 1;
    }
}

fn check_line(sim: &mut Simulation, line: &Vec<char>, line_n: usize) -> Option<Vec<String>> {
    // finds matches in a line and prints out result
    // LINE:MATCH

    let mut lm = 0;
    let line_len = line.len();
    let mut i = 0;
    let mut matches: Vec<String> = Vec::new();
    while i < line_len {
        sim.reset();
        check_to_end(sim, &mut lm, i, &line, line_len);
        if lm > i {
            let m: String = line[i..lm].into_iter().collect();
            matches.push(m);
            // println!("{line_n}:{}", m);
            i = lm;
        }
        else {
            i += 1;
        }
    }
    if matches.is_empty() {
        None
    }
    else {
        Some(matches)
    }
}

fn check_line_with_prefix(sim: &mut Simulation, line: &Vec<char>, 
                          line_n: usize, bm: &mut BM) -> Option<Vec<String>> {
    // finds matches in a line and prints out result 
    // using Boyer-Moore Alrgoithm to find matching prefix
    // LINE:MATCH

    let mut lm = 0;
    let line_len = line.len();
    let mut i = 0;
    let mut matches: Vec<String> = Vec::new();
    while i < line_len {
        sim.reset();
        // prefix idx becomes idx to start search
        let mut prefix_idx = 0;
        let found_prefix = bm.find(&mut prefix_idx, &line[i..]);
        i = i + prefix_idx;
        if sim.accepts() {
            lm = i - 1;
        }
        if !found_prefix {
            // don't search if prefix not found in line
            break;
        }
        check_to_end_prefix(sim, &mut lm, i, &line, line_len);
        if lm >= i - 1{
            let m_start = i - bm.pattern_len();
            let m: String = line[m_start..=lm].into_iter().collect();
            matches.push(m);
            // println!("{line_n}:{}", m);
            i = lm + 1;
        }
        else {
            i += 1;
        }
    }
    if matches.is_empty() {
        None
    }
    else {
        Some(matches)
    }
}

fn compare_match((idx_a, _): & Match, (idx_b, _): & Match) -> cmp::Ordering {
    (*idx_a).cmp(idx_b)
}

// From Rust by Example Docs:
// https://doc.rust-lang.org/rust-by-example/std_misc/file/read_lines.html
fn read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>>
where P: AsRef<Path>, {
    let file = File::open(filename)?;
    // bufreader for efficiency
    Ok(io::BufReader::new(file).lines())
}