all the extra stuff

c26edfd5 · Ricardo A Garcia · 6955c448 · c26edfd5 · c26edfd5 · c26edfd5
Commit c26edfd5 authored 1 year ago by Ricardo A Garcia
Hide whitespace changes
Inline Side-by-side

Showing

with 423 additions and 0 deletions
+423 -0
--- a/my_grep_versions/base_grep
+++ b/my_grep_versions/base_grep
--- a/my_grep_versions/base_grep_release
+++ b/my_grep_versions/base_grep_release
--- a/my_grep_versions/base_grep_release_vecs
+++ b/my_grep_versions/base_grep_release_vecs
--- a/my_grep_versions/cache_attempt1_release
+++ b/my_grep_versions/cache_attempt1_release
--- a/my_grep_versions/mostly_vecs_release
+++ b/my_grep_versions/mostly_vecs_release
--- a/scripts/ATCG_maker.py
+++ b/scripts/ATCG_maker.py
+import random
+import string
+import os
+
+def generate_dna_file(n_rows, m_columns, output_dir='script_results/dna_files'):
+    # Create output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Generate the random DNA sequence
+    dna_sequence = [''.join(random.choices('ATCG', k=m_columns)) for _ in range(n_rows)]
+
+    # Create the filename based on the number of rows and columns
+    filename = f'random_ATCG_{m_columns}x{n_rows}.txt'
+    filepath = os.path.join(output_dir, filename)
+
+    # Write the DNA sequence to the file
+    with open(filepath, 'w') as f:
+        for line in dna_sequence:
+            f.write(line + '\n')
+
+    print(f'File saved to: {filepath}')
+
+# Example usage
+if __name__ == "__main__":
+    n_rows = 2000000 # Number of rows
+    m_columns = 50 # Number of columns per row
+    generate_dna_file(n_rows, m_columns)
--- a/scripts/ATCG_maker.sh
+++ b/scripts/ATCG_maker.sh
+#!/bin/bash
+script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+# echo "script_dir is $script_dir"
+# Check if both arguments are provided
+if [ $# -ne 2 ]; then
+    echo "Usage: $0 <line_length> <number_of_lines>"
+    exit 1
+fi
+
+# Assign input arguments to variables
+line_length=$1
+num_lines=$2
+
+# Generate random ATCG sequence
+random_sequence() {
+    # Define characters to choose from
+    chars="ATCG"
+
+    # Generate random sequence of length $1
+    seq=""
+    for ((i=0; i<$1; i++)); do
+        rand_index=$(( RANDOM % ${#chars} ))
+        seq+=${chars:$rand_index:1}
+    done
+
+    echo "$seq"
+}
+
+# Generate random ATCG file
+output_file="$script_dir/../script_results/dna_files/random_ATCG_${line_length}x${num_lines}.txt"
+touch "$output_file"
+for ((i=0; i<$num_lines; i++)); do
+    sequence=$(random_sequence $line_length)
+    echo "$sequence" >> "$output_file"
+done
+
+echo "Random ATCG file generated: $output_file"
--- a/scripts/compare_greps.sh
+++ b/scripts/compare_greps.sh
+#!/bin/bash
+
+# Check if the correct number of arguments is provided
+if [ "$#" -ne 3 ]; then
+    echo "Usage: $0 <sys_regex_pattern> <my_regex_pattern> <file_to_search>"
+    exit 1
+fi
+
+# Get the regex pattern and file to search from the arguments
+sys_regex_pattern=$1
+my_regex_pattern=$2
+file_to_search=$3
+
+# Check if the file exists
+if [ ! -f "$file_to_search" ]; then
+    echo "Error: File '$file_to_search' not found."
+    exit 1
+fi
+
+# Time the system grep implementation
+echo "Running system grep..."
+{ time ../ripgrep/target/release/rg -on "$sys_regex_pattern" "$file_to_search"; } 2> ./script_results/system_grep_time.txt > /dev/null #./script_results/system_grep_output.txt
+
+echo ""
+echo "System grep time:"
+cat ./script_results/system_grep_time.txt
+
+# Time the custom grep implementation
+echo "Running custom grep..."
+{ time ./target/debug/grep "$my_regex_pattern" "$file_to_search"; } 2> ./script_results/custom_grep_time.txt > /dev/null #./script_results/custom_grep_output.txt
+
+echo ""
+echo "Custom grep time:"
+cat ./script_results/custom_grep_time.txt
+# diff ./script_results/system_grep_output.txt ./script_results/custom_grep_output.txt
\ No newline at end of file
--- a/scripts/dna_time_tests.sh
+++ b/scripts/dna_time_tests.sh
+#!/bin/bash
+set -euxo pipefail
+
+script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+# echo "script_dir is $script_dir"
+
+# Path to the directory containing files
+directory="$script_dir/../script_results/dna_files"
+script="$script_dir/time_a_grep.sh"
+
+# echo "directory is: $directory"
+# echo "script is: $script"
+
+# Additional arguments for the script
+arg1="CCCC((AC*G)|(TTT))"
+grep_v="$script_dir/../my_grep_versions/mostly_vecs_release"
+
+echo "------------------------------" >> $script_dir/../script_results/dna_time_tests2.txt
+echo "$grep_v" >> $script_dir/../script_results/dna_time_tests2.txt
+echo "------------------------------" >> $script_dir/../script_results/dna_time_tests2.txt
+
+# Iterate over each file in the directory
+find "$directory" -type f -exec bash -c '
+filename=$(basename $1); 
+echo "Executing script for file: $filename" >> "$4/../script_results/dna_time_tests2.txt";
+$0 $2 "$3" $1 >> "$4/../script_results/dna_time_tests2.txt";
+echo ""' "$script" {} "$grep_v" "$arg1" "$script_dir" \;
+#  > "$script_dir/../script_results/dna_time_tests.txt"
\ No newline at end of file
--- a/scripts/make_random_txt.sh
+++ b/scripts/make_random_txt.sh
+#!/bin/bash
+# Write <number_of_characters> random A/T/C/G letters to <filename>,
+# wrapped at 50 letters per line.
+
+# Check if the correct number of arguments is provided
+if [ "$#" -ne 2 ]; then
+    echo "Usage: $0 <filename> <number_of_characters>"
+    exit 1
+fi
+
+# Get the filename and number of characters from the arguments
+filename=$1
+num_chars=$2
+
+# Generate the random letters and save to the file
+base_letters=("A" "T" "C" "G")
+count=0
+
+# One truncating redirect covers all output: it clears an existing file and
+# opens it once instead of once per letter.
+{
+    for ((i=0; i<num_chars; i++)); do
+        printf '%s' "${base_letters[RANDOM % 4]}"
+        count=$((count + 1))
+        # Break the line after every 50th letter
+        if [ "$count" -eq 50 ]; then
+            printf '\n'
+            count=0
+        fi
+    done
+
+    # If the last line is incomplete, terminate it with a newline
+    if [ "$count" -ne 0 ]; then
+        printf '\n'
+    fi
+} > "$filename"
+
+echo "Generated $num_chars random letters and saved to $filename"
--- a/scripts/plot_data.py
+++ b/scripts/plot_data.py
+import numpy as np
+import matplotlib.pyplot as plt
+
+def parse_file(filename):
+    with open(filename, 'r') as file:
+        lines = []
+        for line in file:
+            lines.append(line.strip().split("/")[-1])
+    
+    results = {}
+    current_title = None
+    for i in range(len(lines)):
+        if lines[i].startswith("-") and i+2 < len(lines) and lines[i+2].startswith("-"):
+            print(lines[i+1])
+            results[lines[i+1]] = (lines[i+4:i+14:2],lines[i+16:i+26:2][::-1])
+    if current_title is not None:
+        results[current_title] = lines  # Include the last set of lines
+    return results
+
+# Usage
+filename = "script_results/dna_time_tests2.txt"
+data = parse_file(filename)
+print(data)
+
+X = 100*(np.arange(5)**10)
+
+for k in data:
+    print(k)
+    print(len(data[k][0]))
+    print(data[k][0][0].split('s')[0].split('(')[1])
+    plt.plot(X, [float(d.split('s')[0].split('(')[1]) for d in data[k][1]][::-1], label=k)
+
+plt.xlabel('File Size (chars)')
+plt.ylabel('Running time (s)')
+plt.title('Average Time Complexity of Grep Implementations (Multi-Line)')
+plt.legend()
+plt.grid(True)
+plt.yscale('log')
+plt.xscale('log')
+plt.savefig('mul_ml2.png')
+plt.show()
\ No newline at end of file
--- a/scripts/plot_dna_test_output.py
+++ b/scripts/plot_dna_test_output.py
+import re
+import matplotlib.pyplot as plt
+import numpy as np
+
+def parse_data(filename):
+    system_times_x1 = []
+    custom_times_x1 = []
+    system_times_50x = []
+    custom_times_50x = []
+    system_std_x1 = []
+    custom_std_x1 = []
+    system_std_50x = []
+    custom_std_50x = []
+
+
+    with open(filename, 'r') as file:
+        lines = file.readlines()
+
+        for i in range(len(lines)):
+            line = lines[i]
+            # print("line #",i,": ", line)
+            # print("?:", re.findall(r'Mean: ([\d.]+)s', lines[i+3]))
+            if 'random_ATCG_' in line:
+                if 'x1.txt' in line:
+                    system_times_x1.append(float(re.findall(r'Mean: ([\d.]+)s', lines[i+3])[0]))
+                    custom_times_x1.append(float(re.findall(r'Mean: ([\d.]+)s', lines[i+7])[0]))
+                    system_std_x1.append(float(re.findall(r'Standard deviation: ([\d.]+)s', lines[i+4])[0]))
+                    custom_std_x1.append(float(re.findall(r'Standard deviation: ([\d.]+)s', lines[i+8])[0]))
+                elif '50x' in line:
+                    system_times_50x.append(float(re.findall(r'Mean: ([\d.]+)s', lines[i+3])[0]))
+                    custom_times_50x.append(float(re.findall(r'Mean: ([\d.]+)s', lines[i+7])[0]))
+                    system_std_50x.append(float(re.findall(r'Standard deviation: ([\d.]+)s', lines[i+4])[0]))
+                    custom_std_50x.append(float(re.findall(r'Standard deviation: ([\d.]+)s', lines[i+8])[0]))
+
+    # print("did we get here?")
+    return system_times_x1[::-1], custom_times_x1[::-1], system_times_50x, custom_times_50x, system_std_x1[::-1], custom_std_x1[::-1], system_std_50x, custom_std_50x
+
+def plot_times(system_times, custom_times, system_std, custom_std, title, filename):
+    X = 1000*(10**np.arange(len(system_times)))
+    plt.errorbar(X,system_times,yerr=system_std, label='ripgrep')
+    plt.errorbar(X,custom_times,yerr=custom_std, label='my grep')
+    plt.xlabel('File Size (chars)')
+    plt.ylabel('Running time (s)')
+    plt.title(title)
+    plt.legend()
+    plt.grid(True)
+    plt.yscale('log')
+    plt.xscale('log')
+    plt.savefig(filename)  # Save the plot as a PNG file
+    plt.close()
+
+if __name__ == "__main__":
+    # must be ran from powershell bash :/
+    filename = r'script_results/dna_time_tests.txt'
+    system_times_x1, custom_times_x1, system_times_50x, custom_times_50x, system_std_x1, custom_std_x1, system_std_50x, custom_std_50x  = parse_data(filename)
+    # print("what about here?")
+    # print(system_times_x1)
+    plot_times(system_times_x1, custom_times_x1, system_std_x1, custom_std_x1, 'Running times for __x1 case', 'times_x1.png')
+    # plt.plot(system_times_x1)
+    # plt.savefig('wtf')
+    plot_times(system_times_50x, custom_times_50x, system_std_50x, custom_std_50x, 'Running times for 50x__ case', 'times_50x.png')
+    # print("?")
\ No newline at end of file
--- a/scripts/time_a_grep.sh
+++ b/scripts/time_a_grep.sh
+#!/bin/bash
+set -euxo pipefail
+
+# Check if the correct number of arguments is provided
+if [ "$#" -ne 3 ]; then
+    echo "Usage: $0 <grep_implementation> <regex_pattern> <file_to_search>"
+    exit 1
+fi
+
+# Get the regex pattern and file to search from the arguments
+regex_pattern=$2
+file_to_search=$3
+grep_implementation=$1
+
+# Check if the file exists
+if [ ! -f "$file_to_search" ]; then
+    echo "Error: File '$file_to_search' not found."
+    exit 1
+fi
+
+# Check if the file exists
+if [ ! -f "$grep_implementation" ]; then
+    echo "Error: File '$grep_implementation' not found."
+    exit 1
+fi
+
+# Initialize arrays to store times
+grep_times=()
+
+# Run both grep implementations 10 times and store times
+for ((i=1; i<=10; i++)); do
+    # Time the system grep implementation #../ripgrep/target/release/rg
+    grep_time=$( { time $grep_implementation "$regex_pattern" "$file_to_search" > /dev/null; } 2>&1 | grep real | awk '{print $2}')
+    grep_times+=("$grep_time")
+done
+
+# Function to calculate mean
+calculate_mean() {
+    local total=0
+    for time in "${@}"; do
+        local processed=$(echo $time | awk -F 'm|s' '{print $1*60 +$2}')
+        total=$(echo "$total + $processed" | bc)
+    done
+    echo "scale=4; $total / $#" | bc
+}
+
+# Function to calculate standard deviation
+calculate_std_dev() {
+    local mean=$1
+    local total=0
+    for time in "${@:2}"; do
+        local processed=$(echo $time | awk -F 'm|s' '{print $1*60 +$2}')
+        # echo "processed is $processed"
+        # echo "mean is $mean"
+        # echo "diff is"
+        # echo "$processed - $mean" | bc
+        # echo ""
+        # echo "scale=10; 0.0013^2" | bc
+        # echo "square diff is "
+        # echo "scale=4; ($processed - $mean)^2" | bc
+        # echo ""
+        total=$(echo "scale=10; $total + ($processed - $mean)^2" | bc)
+    done
+    echo "scale=10; sqrt($total / ($# - 1))" | bc
+}
+
+# Calculate mean and standard deviation for system grep times
+mean=$(calculate_mean "${grep_times[@]}")
+std=$(calculate_std_dev "$mean" "${grep_times[@]}")
+
+# Output the results
+echo "(${mean}s,${std}s)"
--- a/scripts/time_greps_stats.sh
+++ b/scripts/time_greps_stats.sh
+#!/bin/bash
+set -euxo pipefail
+
+# Check if the correct number of arguments is provided
+if [ "$#" -ne 3 ]; then
+    echo "Usage: $0 <sys_regex_pattern> <my_regex_pattern> <file_to_search>"
+    exit 1
+fi
+
+# Get the regex pattern and file to search from the arguments
+sys_regex_pattern=$1
+my_regex_pattern=$2
+file_to_search=$3
+
+# Check if the file exists
+if [ ! -f "$file_to_search" ]; then
+    echo "Error: File '$file_to_search' not found."
+    exit 1
+fi
+
+# Initialize arrays to store times
+system_grep_times=()
+custom_grep_times=()
+
+# Run both grep implementations 10 times and store times
+for ((i=1; i<=10; i++)); do
+    # Time the system grep implementation #../ripgrep/target/release/rg
+    system_grep_time=$( { time ./my_grep_versions/mostly_vecs_release "$my_regex_pattern" "$file_to_search" > /dev/null; } 2>&1 | grep real | awk '{print $2}')
+    system_grep_times+=("$system_grep_time")
+    # echo "sys:$system_grep_time"
+
+    # Time the custom grep implementation
+    custom_grep_time=$( { time ./target/release/grep "$my_regex_pattern" "$file_to_search" > /dev/null; } 2>&1 | grep real | awk '{print $2}')
+    custom_grep_times+=("$custom_grep_time")
+    # echo "cus:$custom_grep_time"
+done
+
+# Function to calculate mean
+calculate_mean() {
+    local total=0
+    for time in "${@}"; do
+        local processed=$(echo $time | awk -F 'm|s' '{print $1*60 +$2}')
+        total=$(echo "$total + $processed" | bc)
+    done
+    echo "scale=4; $total / $#" | bc
+}
+
+# Function to calculate standard deviation
+calculate_std_dev() {
+    local mean=$1
+    local total=0
+    for time in "${@:2}"; do
+        local processed=$(echo $time | awk -F 'm|s' '{print $1*60 +$2}')
+        # echo "processed is $processed"
+        # echo "mean is $mean"
+        # echo "diff is"
+        # echo "$processed - $mean" | bc
+        # echo ""
+        # echo "scale=10; 0.0013^2" | bc
+        # echo "square diff is "
+        # echo "scale=4; ($processed - $mean)^2" | bc
+        # echo ""
+        total=$(echo "scale=10; $total + ($processed - $mean)^2" | bc)
+    done
+    echo "scale=10; sqrt($total / ($# - 1))" | bc
+}
+
+# Calculate mean and standard deviation for system grep times
+system_grep_mean=$(calculate_mean "${system_grep_times[@]}")
+system_grep_std=$(calculate_std_dev "$system_grep_mean" "${system_grep_times[@]}")
+
+# Calculate mean and standard deviation for custom grep times
+custom_grep_mean=$(calculate_mean "${custom_grep_times[@]}")
+custom_grep_std=$(calculate_std_dev "$custom_grep_mean" "${custom_grep_times[@]}")
+
+# Output the results
+echo ""
+echo "System grep times:"
+# printf '%s\n' "${system_grep_times[@]}"
+echo "Mean: ${system_grep_mean}s"
+echo "Standard deviation: ${system_grep_std}s"
+echo ""
+echo "Custom grep times:"
+# printf '%s\n' "${custom_grep_times[@]}"
+echo "Mean: ${custom_grep_mean}s"
+echo "Standard deviation: ${custom_grep_std}s"
+