Pairwise sequence alignment is an essential tool in bioinformatics. To find similar protein sequences in a database, we can use global and local sequence alignment algorithms. Since performing pairwise sequence alignment against all sequences in the database is very time-consuming, heuristic algorithms may be applied to reduce the required time at the cost of slightly decreased accuracy. This thesis proposes an algorithm that quickly picks out feasible sequences with better alignment scores from the database. The proposed algorithm, FastSearch, can be tuned by setting parameters to speed up the sequence search. FastSearch is compared with GGSEARCH based on the optimal alignment score computed by the Needleman-Wunsch algorithm. In addition, the alignment scores are compared with those of GGSEARCH and ClustalW for various sequence identities. The experimental data sets are retrieved from the NCBI database, including the COVID-19 proteins, the EUMAT and the Human proteins. As the experimental results show, our FastSearch algorithm achieves better accuracy and time efficiency than GGSEARCH. In the comparison of alignment scores for various identities, our BestJump algorithm also achieves comparable scores and requires less time than GGSEARCH and ClustalW.
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Date;
/**
 * Driver for the full experiment run.
 *
 * <p>Builds a {@code FastSearch} instance from the database and score-matrix files, reads the
 * query sequences, runs the search once per query, and writes the returned entries to
 * {@code NW-fiveReturn.csv}, one line per query.
 */
public class Main {
    /**
     * Runs the experiment with the hard-coded file names and parameters below.
     *
     * @param args command-line arguments; not used
     * @throws FileNotFoundException declared by the original signature; kept for compatibility
     * @throws IOException if the result file cannot be opened or written
     */
    public static void main(String[] args) throws FileNotFoundException, IOException {
        // Input files.
        String queryName = "query.txt"; // query
        String databaseName = "database.txt"; // database
        String scoreName = "blosum62.txt"; // score matrix

        // Algorithm parameters (symbols follow the original comments).
        int gapScore = -4; // gap penalty g
        int fixedJump = 20; // d_fixed
        int seedCount = 25; // n_seed
        int firstStepFilterSplitSize = 5; // alpha
        int secondStepFilterSplitSize = 2; // beta
        int finalReturnSize = 5; // gamma
        int[][] firstStepPara = {{1, 4, 2}, {1, 1, 1}}; // weight parameters for FastLCS stage
        int[][] secondStepPara = {{0, 1, 0}, {0, 0, 1}, {0, 1, 1}, {1, 1, 16}}; // weight parameters for BestJump stage

        // Create the FastSearch object from the database and score matrix files.
        FastSearch fs = new FastSearch(databaseName, scoreName);
        fs.set_fixed(fixedJump); // set d_fixed
        fs.set_seeds(seedCount); // set n_seed
        fs.set_nwpara(gapScore); // set gap penalty

        // Read the query sequences.
        FastaRead queryDB = new FastaRead(queryName);
        int queryCount = queryDB.dataName.length;

        // try-with-resources guarantees the writer is flushed and closed even when a search
        // call or a write throws part-way through the run.
        try (FileWriter fw = new FileWriter("NW-fiveReturn.csv")) {
            for (int i = 0; i < queryCount; i++) {
                String query = queryDB.dataSequence[i];

                // NOTE(review): the result of this FastSearch run was overwritten by the
                // fullNWSearchDB result in the original code and never written out. The call
                // is kept in case it has side effects not visible from this file; confirm
                // whether it is still needed for this (NW) output.
                fs.fastSearchDBwithReverse(query, firstStepFilterSplitSize,
                        secondStepFilterSplitSize, firstStepPara, secondStepPara);

                // This is the result that is actually written to the file.
                String[] get = fs.fullNWSearchDB(query);

                // Line format: "query : ans1,ans2,...," (every answer is followed by a comma).
                fw.write(queryDB.dataName[i] + " : ");

                // Never read past the end of the result when fewer than finalReturnSize
                // entries were returned.
                int returned = Math.min(finalReturnSize, get.length);
                for (int j = 0; j < returned; j++) {
                    fw.write(get[j] + ",");
                }
                fw.write("\n");
            }
        }
    }
}