We define the similarity of two input sequences A and B, denoted as Similarity(A,B), as shown below.
Using this similarity formula, we present a semi-random method for generating string data with a specified similarity.
Suppose that we are given a sequence A with |A| = m, the alphabet size |Σ|, the desired similarity value λ0, and the upper bound u on the number of iterations, where λi denotes the similarity value obtained in iteration i. Our semi-random method for generating a sequence B with |B| = n whose similarity to A approximates λ0 is presented as follows.
Step 1: Calculate the desired LCS length, denoted as LCS0(A,B), from λ0 with Equation 1, and let i = 1.
Step 2: Set B = A and calculate ci, the number of elements to change, with Equation 1. Randomly change ci elements in sequence B.
Step 3: If n > m, randomly insert (n - m) characters at random positions of B.
Step 4: Calculate the new LCSi(A,B) and the similarity λi.
Step 5: If λi ∈ [λ0 - 0.02, λ0 + 0.02] or i ≥ u, output B and stop. Otherwise, set i = i + 1 and go to Step 2.
/**
 * Generate a random integer sequence A and a sequence B whose similarity to A
 * approximates a target value, recording sampled pairs through insert_to_DB().
 *
 * Similarity is computed here as LCS(A,B) / $Alen, where the LCS length is
 * derived from the edit distance returned by EditD_NP() as
 * ($Alen + $Blen - editDistance) / 2. That identity holds for ins=1, del=1,
 * rep=2 (per the original author's note); other costs are passed through
 * unchecked.
 *
 * Each attempt copies A, overwrites $change randomly chosen positions of the
 * copy, spreads the copy over $Alen randomly chosen positions of B and fills
 * the remaining positions of B with random values. $change is then adjusted
 * towards the target and the attempt repeated, for at most 100 retries.
 *
 * @param array $A       Output. Index 0 is set to -1 (sentinel); indices
 *                       1..$Alen receive random integers in [1, $size].
 * @param array $B       Output. Index 0 is set to -2 (sentinel); indices
 *                       1..$Blen receive the generated sequence.
 * @param float $similar Target similarity (presumably in [0, 1] - confirm with callers).
 * @param int   $size    Upper bound of the random values, i.e. alphabet size.
 * @param int   $Alen    Length of A; must be >= 1.
 * @param int   $Blen    Length of B; must be >= $Alen.
 * @param mixed $ins     Insertion cost forwarded to EditD_NP().
 * @param mixed $del     Deletion cost forwarded to EditD_NP().
 * @param mixed $rep     Replacement cost forwarded to EditD_NP().
 * @param mixed $gennum  Forwarded unchanged to insert_to_DB().
 * @param PDO   $db_pdo  Database handle forwarded to insert_to_DB().
 *
 * @return float Similarity of the last generated pair.
 *
 * @throws \InvalidArgumentException If $Alen < 1 or $Blen < $Alen.
 */
function gen_similar_int(&$A,&$B,$similar,$size,$Alen,$Blen,$ins,$del,$rep,$gennum,PDO $db_pdo)
{
    // Without this guard the similarity division would divide by zero, or the
    // position selection below could never place $Alen elements into B.
    if ($Alen < 1 || $Blen < $Alen) {
        throw new \InvalidArgumentException('gen_similar_int() requires 1 <= Alen <= Blen');
    }

    $change = $Alen - (int)($Alen * $similar); // number of elements of A to overwrite
    $answer = 0.0;                             // similarity actually obtained
    $go = true;
    $count = 0;
    $trytimes = 100;                           // maximum number of adjustment retries
    $tryary = array_fill(0, 21, 0);            // one flag per 5% similarity bucket (0..100)
    $gap = array();                            // $gap[$i] == 1: B[$i] is taken from (modified) A

    $A[0] = -1;
    $B[0] = -2;

    // generate a random string A
    for ($i = 1; $i <= $Alen; ++$i) {
        $A[$i] = rand(1, $size);
    }

    do {
        $tA = $A;

        // Choose which positions of B receive A's elements. With equal lengths
        // every position does; otherwise pick $Alen distinct random positions.
        $gap = array_fill(1, $Blen, ($Alen == $Blen) ? 1 : 0);
        if ($Alen != $Blen) {
            $diff = $Alen;
            while ($diff > 0) {
                $temp = rand(1, $Blen);
                if ($gap[$temp] == 0) {
                    $gap[$temp] = 1;
                    $diff--;
                }
            }
        }

        // change some elements of the copy of A to approach the target similarity
        for ($i = 1; $i <= $change; ++$i) {
            $tA[rand(1, $Alen)] = rand(1, $size);
        }

        // lay the modified copy into B at the chosen positions, random filler elsewhere
        $countA = 1;
        for ($i = 1; $i <= $Blen; ++$i) {
            if ($gap[$i]) {
                $B[$i] = $tA[$countA];
                ++$countA;
            }
            else {
                $B[$i] = rand(1, $size);
            }
        }

        // compute the similarity of A and B (EditD_NP returns the edit distance)
        $LCS = ($Alen + $Blen - EditD_NP($A, $B, $Alen, $Blen, $ins, $del, $rep)) / 2;
        $answer = $LCS / $Alen;
        $TLCS = $similar * $Alen; // LCS length corresponding to the target similarity

        if ($answer > ($similar + 0.01)) {
            // Too similar: overwrite more elements next time.
            $LCSdiff = round($LCS - $TLCS);
            if ($change + $LCSdiff >= $Alen) {
                // A full step would overshoot; take a small step instead.
                if ($LCSdiff / $trytimes > 1) {
                    $change += ($LCSdiff / $trytimes);
                }
                else {
                    $change += 1;
                }
            }
            else {
                $change += $LCSdiff;
            }
            if ($change > $Alen) {
                $change = $Alen;
            }
        }
        else if ($answer < ($similar - 0.01)) {
            // Not similar enough: overwrite fewer elements next time. The
            // shortfall is measured as a positive number so that subtracting
            // it really lowers $change.
            $LCSdiff = round($TLCS - $LCS);
            if ($change - $LCSdiff <= 0) {
                // A full step would undershoot; take a small step instead.
                if ($LCSdiff / $trytimes > 1) {
                    $change -= ($LCSdiff / $trytimes);
                }
                else {
                    $change -= 1;
                }
            }
            else {
                $change -= $LCSdiff;
            }
            if ($change < 0) {
                $change = 0;
            }
        }
        else {
            // within +/- 0.01 of the target (boundaries included): done
            $go = false;
        }

        // store this pair unless its 5% similarity bucket was already recorded
        $bucket = (int) round($answer * 100 / 5);
        $RS = $bucket * 5;
        $tablename = $Alen."_".($Blen/$Alen)."_".$size."_".$RS;
        if (empty($tryary[$bucket])) {
            $tryary[$bucket] = 1;
            insert_to_DB($A, $B, $answer, $similar, $count, $gennum, $tablename, $db_pdo); //insert to DataBase
        }

        ++$count;
        if ($count > $trytimes) {
            $go = false;
        }
    }
    while ($go);

    return $answer;
}