This snippet is about sorting a set of strings using merge sort. However, this version is tailored for strings as it relies on lcp
values:
Given two strings \$A\$ and \$B\$, \$lcp(A, B)\$ is the length of the longest prefix shared by both \$A\$ and \$B\$. The cool point to note here is that if you know at least a lower bound of \$lcp(A, B)\$ (say, \$k < lcp(A, B)\$), whenever comparing \$A\$ and \$B\$, you can skip the first \$k\$ characters, and this string merge sort relies on these values in order to speed up the computation.
StringMergesort.java:
package net.coderodde.util; import java.util.Arrays; import java.util.Random; public class StringMergesort { private static final int ALPHABET_SIZE = 26; private static final class ComparisonResult { int cmpValue; int lcp; } public static void sort(String[] array) { sort(array, 0, array.length); } public static void sort(String[] array, int fromIndex, int toIndex) { if (toIndex - fromIndex < 2) { return; } String[] auxArray = array.clone(); int[] sourceLCPArray = new int[auxArray.length]; int[] targetLCPArray = new int[auxArray.length]; ComparisonResult result = new ComparisonResult(); sortImpl(auxArray, array, sourceLCPArray, targetLCPArray, fromIndex, toIndex, result); } private static void lcpCompare(String a, String b, int k, ComparisonResult comparisonResult) { int length = Math.min(a.length(), b.length()); for (int i = k; i < length; ++i) { char ach = a.charAt(i); char bch = b.charAt(i); if (ach != bch) { comparisonResult.cmpValue = Character.compare(ach, bch); comparisonResult.lcp = i; return; } } comparisonResult.lcp = length; if (a.length() > length) { comparisonResult.cmpValue = 1; } else if (a.length() < length) { comparisonResult.cmpValue = -1; } else { comparisonResult.cmpValue = 0; } } private static void sortImpl(String[] source, String[] target, int[] sourceLCPArray, int[] targetLCPArray, int fromIndex, int toIndex, ComparisonResult result) { int rangeLength = toIndex - fromIndex; if (rangeLength < 2) { return; } int middle = fromIndex + ((toIndex - fromIndex) >> 1); sortImpl(target, source, targetLCPArray, sourceLCPArray, fromIndex, middle, result); sortImpl(target, source, targetLCPArray, sourceLCPArray, middle, toIndex, result); merge(source, target, sourceLCPArray, targetLCPArray, fromIndex, middle - fromIndex, toIndex - middle, result); } private static void merge(String[] source, String[] target, int[] sourceLCPArray, int[] targetLCPArray, int offset, int leftRangeLength, int rightRangeLength, ComparisonResult result) { int left = offset; int leftBound = offset + leftRangeLength; int right = leftBound; int rightBound = right + rightRangeLength; int targetIndex = offset; while (left < leftBound && right < rightBound) { int leftLCP = sourceLCPArray[left]; int rightLCP = sourceLCPArray[right]; if (leftLCP > rightLCP) { target[targetIndex] = source[left]; targetLCPArray[targetIndex++] = sourceLCPArray[left++]; } else if (rightLCP > leftLCP) { target[targetIndex] = source[right]; targetLCPArray[targetIndex++] = sourceLCPArray[right++]; } else { lcpCompare(source[left], source[right], leftLCP, result); if (result.cmpValue <= 0) { target[targetIndex] = source[left]; targetLCPArray[targetIndex++] = sourceLCPArray[left++]; sourceLCPArray[right] = result.lcp; } else { target[targetIndex] = source[right]; targetLCPArray[targetIndex++] = sourceLCPArray[right++]; sourceLCPArray[left] = result.lcp; } } } while (left < leftBound) { target[targetIndex] = source[left]; targetLCPArray[targetIndex] = sourceLCPArray[left]; targetIndex++; left++; } while (right < rightBound) { target[targetIndex] = source[right]; targetLCPArray[targetIndex] = sourceLCPArray[right]; targetIndex++; right++; } } public static void main(String[] args) { long seed = System.nanoTime(); Random random = new Random(seed); String[] array1 = createRandomStringArray(500_000, 300, random); String[] array2 = array1.clone(); System.out.println("Seed = " + seed); long startTime = System.nanoTime(); Arrays.sort(array1); long endTime = System.nanoTime(); System.out.format("Arrays.sort in %.2f milliseconds.\n", (endTime - startTime) / 1e6); startTime = System.nanoTime(); StringMergesort.sort(array2); endTime = System.nanoTime(); System.out.format("StringMergesort.sort in %.2f milliseconds.\n", (endTime - startTime) / 1e6); System.out.println("Arrays equal: " + Arrays.equals(array1, array2)); } private static String[] createRandomStringArray(int size, int maxLength, Random random) { String[] ret = new String[size]; for (int i = 0; i < size; ++i) { ret[i] = randomString(maxLength, random); } return ret; } private static String randomString(int maxLength, Random random) { int length = random.nextInt(maxLength + 1); StringBuilder sb = new StringBuilder(length); for (int i = 0; i < length; ++i) { sb.append((char)('a' + random.nextInt(ALPHABET_SIZE))); } return sb.toString(); } }
My performance figures are:
Seed = 5896859395728 Arrays.sort in 1220.41 milliseconds. StringMergesort.sort in 669.97 milliseconds. Arrays equal: true
Is there anything to improve here? Once again, I would like to hear comments on naming conventions, coding style, performance, and optimisation opportunities.