boyer_moore.rs 8.15 KB
const LAST: usize = 0x7E - 0x20;
const FIRST: usize = 0x20;
const NULL: usize = LAST + 1;

#[derive(Clone)]
pub struct BM {
    last_t: Vec<Vec<usize>>,
    pattern: Vec<char>,
    pattern_idx: usize,
    pattern_len: usize,
    line_idx: usize,
    line_len: usize,
}


pub fn to_idx(c: char) -> usize {
    if c == '\t' {
        LAST
    }
    else if (c >=' ') & (c <= '~'){
        (c as usize) - FIRST
    }
    else {
        NULL
    }
}

impl BM {
    pub fn new(pattern_str: String) -> Self {
        let pattern: Vec<char> = pattern_str.chars().collect();
        let l = pattern.len();
        if l == 0 {
            BM {last_t: Vec::new(), 
                pattern: pattern, pattern_idx: 0, pattern_len: 0,
                line_idx: 0, line_len: 0
                }
        }
        else {
                let mut  new = BM {last_t: Vec::new(), 
                pattern: pattern, pattern_idx: *&l-1, pattern_len: l,
                line_idx: 0, line_len: 0
                };
            new.fill_last_table();
            new
        }

    }

    pub fn fill_last_table(&mut self) {
        // creates a table where t[i, j] is the last occurance
        // of char i to the left of j on the pattern

        // list of last idx for each char as we traverse the pattern
        let mut last: Vec<usize> = Vec::new();
        for _ in ' '..='~' {
            self.last_t.push(Vec::new());
            last.push(NULL);
        }
        // for horizontal tab
        self.last_t.push(Vec::new());
        last.push(NULL);

        for (i, c) in self.pattern.iter().enumerate() {
            for a in ' '..='~' {
                self.last_t[to_idx(a)].push(last[to_idx(a)]);
                if a == *c {
                    last[to_idx(a)] = i;
                }
            }
            self.last_t[to_idx('\t')].push(last[to_idx('\t')]);
            if '\t' == *c {
                last[to_idx('\t')] = i;
            }
        }
    }

    pub fn bad_compare(&mut self, match_idx: &mut usize, line: &[char]) -> Option<bool> {
        if self.line_idx >= self.line_len {
            return None
        }
        if line[self.line_idx] == self.pattern[self.pattern_idx] {
            if self.pattern_idx == 0 {
                // match
                *match_idx = self.line_idx + self.pattern_len;
                Some(true)
            }
            else {
                self.pattern_idx -= 1;
                self.line_idx -= 1;
                Some(false)
            }
        }
        else {
            let line_char_idx = to_idx(line[self.line_idx]);
            if line_char_idx == NULL {
                // println!("Chastized char: {}", line[self.line_idx]);
                self.line_idx = self.line_idx + self.pattern_len;
                self.pattern_idx = self.pattern_len - 1;

                if self.line_idx >= self.line_len {
                    // searched the entire line
                    return None;
                }
                else {
                    return Some(false);
                }
            }
            if self.last_t[line_char_idx][self.pattern_idx] == NULL {
                // there is no matching char to the left
                self.line_idx = self.line_idx + self.pattern_len;
                self.pattern_idx = self.pattern_len - 1;

                if self.line_idx >= self.line_len {
                    // searched the entire line
                    None
                }
                else {
                    Some(false)
                }
            }
            else {
                // shift pattern to align next occurance of line char
                self.line_idx = self.line_idx 
                                + self.pattern_len - 1 
                                - self.last_t[line_char_idx][self.pattern_idx];
                self.pattern_idx = self.pattern_len - 1;
                Some(false)
            }
            
        }
    }

    pub fn find(&mut self, match_idx: &mut usize, line: &[char]) -> bool{
        if self.pattern_len > 0 {
            let l = line.len();
            // self.line = line;
            self.line_len = l;
            self.line_idx = self.pattern_len - 1;

            // set pattern to be at last char
            self.pattern_idx = self.pattern_len - 1;
            while let Some(found_pattern) = self.bad_compare(match_idx, &line) {
                if found_pattern {
                    return true;
                };
            }
            false
        } 
        else { 
            panic!("empty prefix");
        }
    }

    pub fn pattern_len(&self) -> usize {
        self.pattern_len
    }

    pub fn show_table(&self, char_lst: Vec<char>) {
        for c in char_lst {
            let a_row = &self.last_t[to_idx(c)];
            print!("{}:", c);
            for c in a_row {
                print!(" {}", c);
            }
            println!("");
        }
    }
}

pub fn z_algorithm(input: &Vec<char>) -> Vec<usize> {
    let mut z_arr = Vec::new();
    z_arr.push(0);

    let mut r = 0;
    let mut l = 0;
    let len = input.len();

    for idx in 1..len {
        if idx > r {
            // loop until mismatch
            let mut z = 0;
            while input[z] == input[idx + z] {
                z += 1;

                if idx + z >= len {
                    break;
                }
            }
            if z > 0 {
                // update box boundaries
                r = idx + z - 1;
                l = idx;
            }
            z_arr.push(z);
        }
        else {
            // inside box
            let prefix_z = z_arr[idx-l];
            let box_left = r - idx + 1;
            if  prefix_z < box_left {
                z_arr.push(prefix_z);
            }
            else {
                // match outside box
                let mut z = 1;
                if r + z < len {
                    while input[box_left-1+z] == input[r+z] {
                        z += 1;
                        if r + z >= len {
                            break;
                        }
                    }
                }
                if z > 1 {
                    // new match plus size of previous
                    z_arr.push(z+box_left-1);
                    r += z - 1;
                    l = idx;
                }
                else {
                    // same as previous
                    z_arr.push(box_left);
                }
                
            }
        }
        // println!("{idx}: ({l}, {r}) -> {:?}", z_arr);
    }
    z_arr
}

pub fn suffix_t(input: &Vec<char>) -> Vec<usize> {
    let len = input.len();
    let mut arr: Vec<usize> = Vec::new();
    for _ in 0..len {
        arr.push(0);
    }
    let mut reverse_input = input.clone();
    reverse_input.reverse();
    let mut longest_suffix = z_algorithm(&reverse_input);
    longest_suffix.reverse();

    for (j, z) in longest_suffix.iter().enumerate() {
        let i = len - z;
        if i != len {
            arr[i] = j;
        }
    }
    arr
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn bad_char_fill() {
        let pattern = String::from("aba");
        let mut bm = BM::new(pattern);
        let mut b = 0;
        let str_vec: Vec<char> = String::from("abababa").chars().collect();
        if bm.find(&mut b, &str_vec[..]) {println!("Found it at {b}");}
    }

    #[test]
    fn z_algo() {
        let input: Vec<char> = String::from("aabcaabxaaaz").chars().collect();
        let z_arr = z_algorithm(&input);
        assert_eq!(z_arr, vec![0, 1, 0, 0, 3, 1, 0, 0, 2, 2, 1, 0]);

        let input2: Vec<char> = String::from("abababaabab").chars().collect();
        let z_arr2 = z_algorithm(&input2);
        assert_eq!(z_arr2, vec![0, 0, 5, 0, 3, 0, 1, 4, 0, 2, 0]);

        let input3: Vec<char> = String::from("aaaaa").chars().collect();
        let z_arr3 = z_algorithm(&input3);
        assert_eq!(z_arr3, vec![0, 4, 3, 2, 1]);

        let input4: Vec<char> = String::from("aabaabxaaz").chars().collect();
        let z_arr4 = z_algorithm(&input4);
        assert_eq!(z_arr4, vec![0, 1, 0, 3, 1, 0, 0, 2, 1, 0]);
        // println!("{:?}", z_arr2);

        let input5: Vec<char> = String::from("pnampnam").chars().collect();
        let z_arr5 = suffix_t(&input5);
        println!("{:?}", z_arr5);
    }
}