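We define the similarity of two input sequences A and B, denoted as Similarity(A,B), as shown below (Equation 1). The equation itself is not reproduced in this text; the implementation below computes it as $\mathrm{Similarity}(A,B) = |LCS(A,B)| / |A|$.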
Using the formula for Similarity, we present a semi-random method to generate string data with a specified similarity.
Suppose that we are given a sequence A with |A| = m, the alphabet size |Σ|, the desired similarity value λ0, and the upper bound u on the number of iterations, where λi denotes the similarity value obtained in iteration i. Our semi-random method for generating a sequence B with |B| = n whose similarity to A approximates λ0 is presented as follows.
Step 1: Calculate the desired LCS length, denoted as LCS0(A,B), from λ0 with Equation 1, and let i = 1.
Step 2: Set B = A and calculate ci, the number of elements to change in iteration i, with Equation 1. Randomly change ci elements in sequence B.
Step 3: If n > m, randomly insert (n - m) characters at random positions of B.
Step 4: Calculate the new LCSi(A,B) and similarity λi.
Step 5: If λi ∈ [λ0 - 0.02, λ0 + 0.02] or i ≥ u, output B and stop. Otherwise, set i = i + 1, and go to Step 2.
/**
 * Generate a random integer sequence $A and a sequence $B whose LCS-based
 * similarity to $A approximates $similar.
 *
 * $A is filled with $Alen random values in [1, $size] at indices 1..$Alen.
 * Each iteration copies $A, overwrites $change randomly chosen positions of
 * the copy with random values, spreads the copy over $Alen randomly chosen
 * slots of $B (the remaining $Blen - $Alen slots get random values), and
 * measures similarity as LCS / $Alen. $change is then adjusted and the
 * process repeats until the similarity is within 0.01 of $similar or the
 * attempt budget (100 extra tries) is exhausted.
 *
 * The first pair produced for each 5%-wide similarity bucket is stored via
 * insert_to_DB(), even when it is not the final result.
 *
 * @param array $A       Output (by reference): generated sequence A, 1-based;
 *                       index 0 is set to the sentinel -1.
 * @param array $B       Output (by reference): generated sequence B, 1-based;
 *                       index 0 is set to the sentinel -2.
 * @param float $similar Target similarity.
 * @param int   $size    Upper bound of the random element values (1..$size).
 * @param int   $Alen    Length of A; must be at least 1.
 * @param int   $Blen    Length of B; must be at least $Alen.
 * @param mixed $ins     Insertion cost forwarded to EditD_NP().
 * @param mixed $del     Deletion cost forwarded to EditD_NP().
 * @param mixed $rep     Replacement cost forwarded to EditD_NP().
 *                       NOTE: the LCS formula used here is only stated to
 *                       hold for ins=1, del=1, rep=2.
 * @param mixed $gennum  Forwarded unchanged to insert_to_DB().
 * @param PDO   $db_pdo  Database handle forwarded to insert_to_DB().
 *
 * @return float Similarity of the last generated (A, B) pair.
 *
 * @throws \InvalidArgumentException If $Alen < 1 or $Blen < $Alen (the
 *         original code would divide by zero or loop forever).
 */
function gen_similar_int(&$A, &$B, $similar, $size, $Alen, $Blen, $ins, $del, $rep, $gennum, PDO $db_pdo)
{
    if ($Alen < 1 || $Blen < $Alen) {
        throw new \InvalidArgumentException('gen_similar_int requires 1 <= $Alen <= $Blen');
    }

    $change   = $Alen - (int)($Alen * $similar); // number of elements of A to overwrite
    $answer   = 0.0;                             // real similarity
    $go       = true;
    $count    = 0;
    $trytimes = 100;                             // attempts allowed to reach the target similarity
    $tryary   = array_fill(0, 21, 0);            // one flag per 5% similarity bucket (0..100)
    $gap      = array();                         // $gap[$i] == 1: B[$i] is taken from the mutated A
    $A[0] = -1;                                  // distinct sentinels at index 0
    $B[0] = -2;

    // Generate a random string A.
    for ($i = 1; $i <= $Alen; ++$i) {
        $A[$i] = rand(1, $size);
    }

    do {
        $tA = $A; // working copy of A that receives the random changes

        // Choose which positions of B receive elements of A.
        if ($Alen == $Blen) {
            for ($i = 1; $i <= $Blen; ++$i) {
                $gap[$i] = 1;
            }
        } else {
            for ($i = 1; $i <= $Blen; ++$i) {
                $gap[$i] = 0;
            }
            // Mark exactly $Alen distinct random positions; terminates
            // because $Blen >= $Alen was checked above.
            $diff = $Alen;
            while ($diff > 0) {
                $temp = rand(1, $Blen);
                if ($gap[$temp] == 0) {
                    $gap[$temp] = 1;
                    $diff--;
                }
            }
        }

        // Change some elements of the copy to lower the similarity.
        // Positions may repeat, so fewer than $change distinct elements
        // can end up modified.
        for ($i = 1; $i <= $change; ++$i) {
            $tA[rand(1, $Alen)] = rand(1, $size);
        }

        // Build B: marked positions take the next element of tA in order,
        // all other positions get a fresh random value.
        $countA = 1;
        for ($i = 1; $i <= $Blen; ++$i) {
            if ($gap[$i]) {
                $B[$i] = $tA[$countA];
                ++$countA;
            } else {
                $B[$i] = rand(1, $size);
            }
        }

        // LCS length = (Alen + Blen - EditD) / 2 (if ins=1, del=1, rep=2).
        $LCS    = ($Alen + $Blen - EditD_NP($A, $B, $Alen, $Blen, $ins, $del, $rep)) / 2;
        $answer = $LCS / $Alen;      // real similarity
        $TLCS   = $similar * $Alen;  // LCS length of the target similarity

        if ($answer > ($similar + 0.01)) {
            // Too similar: change more elements next time.
            $LCSdiff = round($LCS - $TLCS); // surplus of LCS length, >= 0
            if ($change + $LCSdiff >= $Alen) {
                // A full step would overshoot; take a small step instead.
                if ($LCSdiff / $trytimes > 1) {
                    $change += ($LCSdiff / $trytimes);
                } else {
                    $change += 1;
                }
            } else {
                $change += $LCSdiff;
            }
            if ($change > $Alen) {
                $change = $Alen;
            }
        } elseif ($answer < ($similar - 0.01)) {
            // Not similar enough: change fewer elements next time. The
            // deficit is taken as a positive number so that subtracting it
            // really lowers $change.
            $LCSdiff = round($TLCS - $LCS); // deficit of LCS length, >= 0
            if ($change - $LCSdiff <= 0) {
                // A full step would undershoot; take a small step instead.
                if ($LCSdiff / $trytimes > 1) {
                    $change -= ($LCSdiff / $trytimes);
                } else {
                    $change -= 1;
                }
            } else {
                $change -= $LCSdiff;
            }
            if ($change < 0) {
                $change = 0;
            }
        } else {
            // Within +/- 0.01 of the target (bounds included): done.
            $go = false;
        }

        // Store the pair unless this similarity bucket was already recorded.
        $RS        = round($answer * 100 / 5) * 5; // similarity in percent, rounded to a multiple of 5
        $tablename = $Alen."_".($Blen/$Alen)."_".$size."_".$RS;
        $bucket    = (int)($RS / 5);
        if (empty($tryary[$bucket])) {
            $tryary[$bucket] = 1;
            insert_to_DB($A, $B, $answer, $similar, $count, $gennum, $tablename, $db_pdo); // insert to DataBase
        }

        ++$count;
        if ($count > $trytimes) {
            $go = false;
        }
    } while ($go);

    return $answer;
}