Commit c26edfd5 authored by Ricardo A Garcia
Browse files

all the extra stuff

No related merge requests found
Pipeline #105748 failed with stage
in 0 seconds
Showing with 423 additions and 0 deletions
+423 -0
File added
File added
File added
File added
File added
import random
import string
import os
def generate_dna_file(n_rows, m_columns, output_dir='script_results/dna_files'):
    """Write a text file of random DNA bases and return its path.

    The file holds ``n_rows`` lines, each made of ``m_columns`` characters
    picked with ``random.choices`` from ``ATCG``. It is named
    ``random_ATCG_{m_columns}x{n_rows}.txt`` and placed in ``output_dir``,
    which is created when it does not exist. A file of the same name is
    overwritten.

    Args:
        n_rows: Number of lines to write.
        m_columns: Number of bases on each line.
        output_dir: Directory that receives the file.

    Returns:
        The path of the file that was written.
    """
    os.makedirs(output_dir, exist_ok=True)

    # The name encodes the dimensions as <columns>x<rows>.
    filename = f'random_ATCG_{m_columns}x{n_rows}.txt'
    filepath = os.path.join(output_dir, filename)

    # Stream the rows to disk one at a time so memory use does not grow with
    # n_rows (the example invocation asks for two million rows).
    with open(filepath, 'w') as f:
        f.writelines(
            ''.join(random.choices('ATCG', k=m_columns)) + '\n'
            for _ in range(n_rows)
        )

    print(f'File saved to: {filepath}')
    return filepath
# Example usage: when run as a script, write a 2,000,000-line file with
# 50 bases on every line into the default output directory.
if __name__ == "__main__":
    n_rows, m_columns = 2000000, 50
    generate_dna_file(n_rows=n_rows, m_columns=m_columns)
#!/bin/bash
# Absolute path of the directory that contains this script.
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Exactly two arguments are required.
if [ "$#" -ne 2 ]; then
    echo "Usage: $0 <line_length> <number_of_lines>" >&2
    exit 1
fi

# Assign input arguments to variables
line_length=$1
num_lines=$2

# Both values are used as bounds of arithmetic for-loops. In an arithmetic
# context a non-numeric word is treated as a variable name and an unset
# variable evaluates to 0, so a typo would silently produce an empty file.
# Reject anything that is not a plain non-negative integer up front.
for value in "$line_length" "$num_lines"; do
    if ! [[ $value =~ ^[0-9]+$ ]]; then
        echo "Error: '$value' is not a non-negative integer." >&2
        exit 1
    fi
done
# Print one random sequence made of the characters A, T, C and G.
#   $1 - number of characters to generate
random_sequence() {
    # Every working variable is local. In particular `i` is also the name of
    # the caller's loop counter, so a global here would clobber it whenever
    # the function is called outside a command substitution.
    local chars="ATCG"
    local seq=""
    local i rand_index
    for ((i = 0; i < $1; i++)); do
        # Pick an index into $chars.
        rand_index=$(( RANDOM % ${#chars} ))
        seq+=${chars:$rand_index:1}
    done
    echo "$seq"
}
# Generate random ATCG file
output_dir="$script_dir/../script_results/dna_files"
output_file="$output_dir/random_ATCG_${line_length}x${num_lines}.txt"

# The results directory may not exist yet on a fresh checkout.
mkdir -p "$output_dir"

# One redirection around the whole loop: the file is opened once and is
# truncated first, so re-running with the same arguments yields exactly
# num_lines lines instead of appending to the previous run's output.
# random_sequence is still called through $(...) so that any variables it
# sets stay inside the subshell.
for ((i = 0; i < num_lines; i++)); do
    sequence=$(random_sequence "$line_length")
    echo "$sequence"
done > "$output_file"

echo "Random ATCG file generated: $output_file"
#!/bin/bash
# Time a single run of ripgrep and of the locally built grep on one file and
# print bash's `time` report for each.

# Check if the correct number of arguments is provided
if [ "$#" -ne 3 ]; then
    echo "Usage: $0 <sys_regex_pattern> <my_regex_pattern> <file_to_search>"
    exit 1
fi

# Get the regex patterns and file to search from the arguments
sys_regex_pattern=$1
my_regex_pattern=$2
file_to_search=$3

# Check if the file exists
if [ ! -f "$file_to_search" ]; then
    echo "Error: File '$file_to_search' not found."
    exit 1
fi

# The timing reports are written here; create the directory so the
# redirections below cannot fail on a fresh checkout.
results_dir=./script_results
mkdir -p "$results_dir"

# Time the system grep implementation.
# `time` writes its report to the stderr of the { ...; } group, which is
# redirected to the report file; the command's own stdout is discarded.
# Anything the command itself prints on stderr ends up in the same file.
# (To compare outputs, send stdout to $results_dir/system_grep_output.txt
# and $results_dir/custom_grep_output.txt instead and diff the two.)
echo "Running system grep..."
{ time ../ripgrep/target/release/rg -on "$sys_regex_pattern" "$file_to_search"; } 2> "$results_dir/system_grep_time.txt" > /dev/null
echo ""
echo "System grep time:"
cat "$results_dir/system_grep_time.txt"

# Time the custom grep implementation
echo "Running custom grep..."
{ time ./target/debug/grep "$my_regex_pattern" "$file_to_search"; } 2> "$results_dir/custom_grep_time.txt" > /dev/null
echo ""
echo "Custom grep time:"
cat "$results_dir/custom_grep_time.txt"
\ No newline at end of file
#!/bin/bash
# Run time_a_grep.sh on every file in the DNA directory and append the
# results to one log file.
set -euxo pipefail

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Directory containing the input files, and the script run on each of them.
directory="$script_dir/../script_results/dna_files"
script="$script_dir/time_a_grep.sh"

# Arguments forwarded to the script: the pattern and the grep binary.
arg1="CCCC((AC*G)|(TTT))"
grep_v="$script_dir/../my_grep_versions/mostly_vecs_release"

# Every result is appended to this file. It is always expanded inside
# quotes so a checkout path containing spaces does not break redirection.
results_file="$script_dir/../script_results/dna_time_tests2.txt"

# Header naming the grep binary that the following measurements belong to.
{
    echo "------------------------------"
    echo "$grep_v"
    echo "------------------------------"
} >> "$results_file"

# Iterate over each file in the directory.
# Positional parameters of the inline script:
#   $0 = timing script, $1 = file found by find, $2 = grep binary,
#   $3 = regex pattern, $4 = results file
find "$directory" -type f -exec bash -c '
    filename=$(basename "$1")
    echo "Executing script for file: $filename" >> "$4"
    "$0" "$2" "$3" "$1" >> "$4"
    echo ""' "$script" {} "$grep_v" "$arg1" "$results_file" \;
\ No newline at end of file
#!/bin/bash
# Exactly two arguments are expected: the output file and the number of
# characters to generate.
if (( $# != 2 )); then
    echo "Usage: $0 <filename> <number_of_characters>"
    exit 1
fi

# Give the positional arguments names.
num_chars=$2
filename=$1
# Generate the random letters and save to the file, 50 letters per line.
base_letters=("A" "T" "C" "G")
line_width=50
count=0

# A single redirection around the whole group opens (and truncates) the file
# once, instead of reopening it in append mode for every character.
{
    for ((i = 0; i < num_chars; i++)); do
        printf '%s' "${base_letters[$RANDOM % 4]}"
        count=$((count + 1))
        # Break the line after every $line_width letters.
        if [ "$count" -eq "$line_width" ]; then
            printf '\n'
            count=0
        fi
    done
    # Terminate a final, partially filled line.
    if [ "$count" -ne 0 ]; then
        printf '\n'
    fi
} > "$filename"

echo "Generated $num_chars random letters and saved to $filename"
import numpy as np
import matplotlib.pyplot as plt
def parse_file(filename):
    """Collect result lines from a timing log, grouped by section title.

    Every line of the file is stripped and reduced to the text after its
    last ``/``. A section starts at a line beginning with ``-`` whose
    second-next line also begins with ``-``; the line between the two is the
    section title, which is printed as it is found.

    Args:
        filename: Path of the log file to read.

    Returns:
        A dict mapping each title to a 2-tuple of lists, with offsets
        counted from the section's first line:
        the lines at offsets 4, 6, 8, 10, 12, and
        the lines at offsets 16, 18, 20, 22, 24 in reverse order.
        Lists are shorter when the file ends early.
    """
    with open(filename, 'r') as file:
        lines = [line.strip().split("/")[-1] for line in file]

    results = {}
    for i, line in enumerate(lines):
        if not line.startswith("-"):
            continue
        # A title is framed by two dashed lines, two positions apart.
        if i + 2 < len(lines) and lines[i + 2].startswith("-"):
            title = lines[i + 1]
            print(title)
            results[title] = (
                lines[i + 4:i + 14:2],
                lines[i + 16:i + 26:2][::-1],
            )
    return results
# Usage: parse the timing log and plot one curve per section title.
filename = "script_results/dna_time_tests2.txt"
data = parse_file(filename)
print(data)

# X positions 100, 1000, ..., 1000000: one power of ten per measurement.
# The exponent goes on the index (10**k); k**10 would put the first point at
# 0, which cannot be drawn on the logarithmic x axis set below.
# NOTE(review): assumes the five inputs grow tenfold starting at 100 chars;
# confirm against the generated files.
X = 100 * (10 ** np.arange(5))

for k in data:
    print(k)
    print(len(data[k][0]))
    print(data[k][0][0].split('s')[0].split('(')[1])
    # Each entry is read as the number between '(' and the first 's'.
    # The second list of the pair is plotted, in reversed order.
    plt.plot(X, [float(d.split('s')[0].split('(')[1]) for d in data[k][1]][::-1], label=k)

plt.xlabel('File Size (chars)')
plt.ylabel('Running time (s)')
plt.title('Average Time Complexity of Grep Implementations (Multi-Line)')
plt.legend()
plt.grid(True)
plt.yscale('log')
plt.xscale('log')
plt.savefig('mul_ml2.png')
plt.show()
\ No newline at end of file
import re
import matplotlib.pyplot as plt
import numpy as np
def parse_data(filename):
    """Extract mean and standard-deviation timings from a results log.

    A record starts at any line containing ``random_ATCG_``. Counted from
    that line, the values are read from fixed offsets:

    * +3 ``Mean: <n>s`` (system)      * +4 ``Standard deviation: <n>s`` (system)
    * +7 ``Mean: <n>s`` (custom)      * +8 ``Standard deviation: <n>s`` (custom)

    Records whose first line contains ``x1.txt`` form the "x1" case; the
    remaining records whose first line contains ``50x`` form the "50x" case.
    Any other record is ignored.

    Args:
        filename: Path of the results log.

    Returns:
        An 8-tuple of lists of floats:
        ``(system_times_x1, custom_times_x1, system_times_50x,
        custom_times_50x, system_std_x1, custom_std_x1, system_std_50x,
        custom_std_50x)``. The four ``x1`` lists are in reverse file order;
        the ``50x`` lists are in file order.

    Raises:
        IndexError: If a record is cut short or an expected line does not
            contain the expected value.
    """
    # Compiled once instead of on every matching line.
    mean_re = re.compile(r'Mean: ([\d.]+)s')
    std_re = re.compile(r'Standard deviation: ([\d.]+)s')

    # (pattern, line offset) for: system mean, custom mean, system std,
    # custom std. Each case keeps its four lists in this same order.
    fields = ((mean_re, 3), (mean_re, 7), (std_re, 4), (std_re, 8))
    x1 = ([], [], [], [])
    x50 = ([], [], [], [])

    with open(filename, 'r') as file:
        lines = file.readlines()

    for i, line in enumerate(lines):
        if 'random_ATCG_' not in line:
            continue
        if 'x1.txt' in line:
            target = x1
        elif '50x' in line:
            target = x50
        else:
            continue
        for bucket, (pattern, offset) in zip(target, fields):
            bucket.append(float(pattern.findall(lines[i + offset])[0]))

    system_times_x1, custom_times_x1, system_std_x1, custom_std_x1 = x1
    system_times_50x, custom_times_50x, system_std_50x, custom_std_50x = x50
    return (
        system_times_x1[::-1],
        custom_times_x1[::-1],
        system_times_50x,
        custom_times_50x,
        system_std_x1[::-1],
        custom_std_x1[::-1],
        system_std_50x,
        custom_std_50x,
    )
def plot_times(system_times, custom_times, system_std, custom_std, title, filename):
    """Save a log-log error-bar plot comparing two sets of timings.

    The x positions are 1000, 10000, ... with one power of ten per entry of
    ``system_times``. The figure is written to ``filename`` and then closed.

    Args:
        system_times: Mean times plotted under the label ``ripgrep``.
        custom_times: Mean times plotted under the label ``my grep``.
        system_std: Error-bar sizes for ``system_times``.
        custom_std: Error-bar sizes for ``custom_times``.
        title: Title of the plot.
        filename: Path the figure is saved to.
    """
    n_points = len(system_times)
    X = 1000 * (10 ** np.arange(n_points))

    series = (
        (system_times, system_std, 'ripgrep'),
        (custom_times, custom_std, 'my grep'),
    )
    for means, spreads, name in series:
        plt.errorbar(X, means, yerr=spreads, label=name)

    plt.xlabel('File Size (chars)')
    plt.ylabel('Running time (s)')
    plt.title(title)
    plt.legend()
    plt.grid(True)
    # Both axes are logarithmic.
    plt.yscale('log')
    plt.xscale('log')
    plt.savefig(filename)
    plt.close()
if __name__ == "__main__":
    # Original author's note: must be run from PowerShell's bash.
    filename = r'script_results/dna_time_tests.txt'

    # Parse the log once, then draw one figure per case.
    (
        system_times_x1, custom_times_x1,
        system_times_50x, custom_times_50x,
        system_std_x1, custom_std_x1,
        system_std_50x, custom_std_50x,
    ) = parse_data(filename)

    plot_times(
        system_times=system_times_x1,
        custom_times=custom_times_x1,
        system_std=system_std_x1,
        custom_std=custom_std_x1,
        title='Running times for __x1 case',
        filename='times_x1.png',
    )
    plot_times(
        system_times=system_times_50x,
        custom_times=custom_times_50x,
        system_std=system_std_50x,
        custom_std=custom_std_50x,
        title='Running times for 50x__ case',
        filename='times_50x.png',
    )
\ No newline at end of file
#!/bin/bash
set -euxo pipefail

# Exactly three arguments are required.
if [[ $# -ne 3 ]]; then
    echo "Usage: $0 <grep_implementation> <regex_pattern> <file_to_search>"
    exit 1
fi

# Give the positional arguments names.
grep_implementation=$1
regex_pattern=$2
file_to_search=$3

# The file to search and the grep binary must both be existing regular
# files; the file to search is checked first.
for required in "$file_to_search" "$grep_implementation"; do
    if [[ ! -f $required ]]; then
        echo "Error: File '$required' not found."
        exit 1
    fi
done
# Initialize arrays to store times
grep_times=()

# Run the grep implementation 10 times and keep the "real" (wall-clock)
# value that bash's `time` reports for each run, e.g. 0m0.012s.
#
# The command's stderr, and the xtrace lines enabled by `set -x` at the top
# of this script, are captured together with the `time` report. The awk
# filter is therefore anchored to lines that START with "real", so a pattern
# or path containing the word "real" cannot be mistaken for the report.
# The binary path is quoted so a path containing spaces is run as one word.
#
# NOTE(review): under `set -eo pipefail` a non-zero exit status of the timed
# command aborts the script here. Confirm that is intended for runs that
# find no match.
for ((i = 1; i <= 10; i++)); do
    grep_time=$( { time "$grep_implementation" "$regex_pattern" "$file_to_search" > /dev/null; } 2>&1 | awk '/^real/ {print $2}')
    grep_times+=("$grep_time")
done
# Print the arithmetic mean, in seconds, of the given durations.
#   $@ - durations in bash `time` notation, <minutes>m<seconds>s
# The result is computed by bc with scale=4.
# Prints an error and returns 1 when no duration is given (the division
# below would otherwise divide by zero).
calculate_mean() {
    if [ "$#" -eq 0 ]; then
        echo "calculate_mean: no times given" >&2
        return 1
    fi
    local total=0
    local duration seconds
    for duration in "$@"; do
        # Split on 'm' and 's' and convert to seconds.
        seconds=$(echo "$duration" | awk -F 'm|s' '{print $1*60 +$2}')
        total=$(echo "$total + $seconds" | bc)
    done
    echo "scale=4; $total / $#" | bc
}
# Print the standard deviation, in seconds, of the given durations.
#   $1      - mean of the durations, in seconds
#   $2...   - durations in bash `time` notation, <minutes>m<seconds>s
# The sum of squared differences from the mean is divided by the number of
# durations (population form) and computed by bc with scale=10.
# Prints an error and returns 1 when no duration follows the mean (the
# division below would otherwise divide by zero).
calculate_std_dev() {
    if [ "$#" -lt 2 ]; then
        echo "calculate_std_dev: expected a mean followed by at least one time" >&2
        return 1
    fi
    local mean=$1
    # $# also counts the mean, so the number of samples is one less.
    local samples=$(( $# - 1 ))
    local total=0
    local duration seconds
    for duration in "${@:2}"; do
        # Split on 'm' and 's' and convert to seconds.
        seconds=$(echo "$duration" | awk -F 'm|s' '{print $1*60 +$2}')
        total=$(echo "scale=10; $total + ($seconds - $mean)^2" | bc)
    done
    echo "scale=10; sqrt($total / $samples)" | bc
}
# Reduce the collected timings to their mean and standard deviation, then
# print both on one line in the form (<mean>s,<std>s).
mean=$(calculate_mean "${grep_times[@]}")
std=$(calculate_std_dev "$mean" "${grep_times[@]}")
printf '(%ss,%ss)\n' "$mean" "$std"
#!/bin/bash
set -euxo pipefail

# Exactly three arguments are required.
if (( $# != 3 )); then
    echo "Usage: $0 <sys_regex_pattern> <my_regex_pattern> <file_to_search>"
    exit 1
fi

# Give the positional arguments names.
file_to_search=$3
my_regex_pattern=$2
sys_regex_pattern=$1

# Refuse to continue when the file to search is not an existing regular file.
if [[ ! -f $file_to_search ]]; then
    echo "Error: File '$file_to_search' not found."
    exit 1
fi
# Initialize arrays to store times
system_grep_times=()
custom_grep_times=()

# Print the "real" (wall-clock) time bash's `time` reports for one run of
# the given command, e.g. 0m0.012s. The command's stdout is discarded.
#   $@ - command and its arguments
# The command's stderr and any xtrace output are captured along with the
# report, so the awk filter only accepts lines that START with "real"; a
# pattern or path containing the word "real" cannot be picked up by mistake.
real_time_of() {
    { time "$@" > /dev/null; } 2>&1 | awk '/^real/ {print $2}'
}

# Run both grep implementations 10 times and store times
for ((i = 1; i <= 10; i++)); do
    # NOTE(review): the "system" slot currently times
    # ./my_grep_versions/mostly_vecs_release with $my_regex_pattern (ripgrep
    # at ../ripgrep/target/release/rg was used here earlier) - confirm.
    system_grep_time=$(real_time_of ./my_grep_versions/mostly_vecs_release "$my_regex_pattern" "$file_to_search")
    system_grep_times+=("$system_grep_time")

    # Time the custom grep implementation
    custom_grep_time=$(real_time_of ./target/release/grep "$my_regex_pattern" "$file_to_search")
    custom_grep_times+=("$custom_grep_time")
done
# Print the arithmetic mean, in seconds, of the given durations.
#   $@ - durations in bash `time` notation, <minutes>m<seconds>s
# The result is computed by bc with scale=4.
# Prints an error and returns 1 when no duration is given (the division
# below would otherwise divide by zero).
calculate_mean() {
    if [ "$#" -eq 0 ]; then
        echo "calculate_mean: no times given" >&2
        return 1
    fi
    local total=0
    local duration seconds
    for duration in "$@"; do
        # Split on 'm' and 's' and convert to seconds.
        seconds=$(echo "$duration" | awk -F 'm|s' '{print $1*60 +$2}')
        total=$(echo "$total + $seconds" | bc)
    done
    echo "scale=4; $total / $#" | bc
}
# Print the standard deviation, in seconds, of the given durations.
#   $1      - mean of the durations, in seconds
#   $2...   - durations in bash `time` notation, <minutes>m<seconds>s
# The sum of squared differences from the mean is divided by the number of
# durations (population form) and computed by bc with scale=10.
# Prints an error and returns 1 when no duration follows the mean (the
# division below would otherwise divide by zero).
calculate_std_dev() {
    if [ "$#" -lt 2 ]; then
        echo "calculate_std_dev: expected a mean followed by at least one time" >&2
        return 1
    fi
    local mean=$1
    # $# also counts the mean, so the number of samples is one less.
    local samples=$(( $# - 1 ))
    local total=0
    local duration seconds
    for duration in "${@:2}"; do
        # Split on 'm' and 's' and convert to seconds.
        seconds=$(echo "$duration" | awk -F 'm|s' '{print $1*60 +$2}')
        total=$(echo "scale=10; $total + ($seconds - $mean)^2" | bc)
    done
    echo "scale=10; sqrt($total / $samples)" | bc
}
# Reduce each set of timings to its mean and standard deviation.
system_grep_mean=$(calculate_mean "${system_grep_times[@]}")
custom_grep_mean=$(calculate_mean "${custom_grep_times[@]}")
system_grep_std=$(calculate_std_dev "$system_grep_mean" "${system_grep_times[@]}")
custom_grep_std=$(calculate_std_dev "$custom_grep_mean" "${custom_grep_times[@]}")

# Output the results. printf reuses the format for every group of three
# arguments, so each implementation gets a blank line, its heading, its mean
# and its standard deviation.
printf '\n%s\nMean: %ss\nStandard deviation: %ss\n' \
    "System grep times:" "$system_grep_mean" "$system_grep_std" \
    "Custom grep times:" "$custom_grep_mean" "$custom_grep_std"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment