Re #307: better algorithm for searching HOT.

Lots of comments after the code. While this patch does not cause any performance gain, it is good for marking possible improvements. Food for thought. git-svn-id: https://svn.code.sf.net/p/speed-dreams/code/trunk@4170 30fe4595-0a0c-4342-8851-515496e4dcbd Former-commit-id: a58e91654a42eb222f5f415ce49232be8b8b05e8 Former-commit-id: e7c837b479b764933fa09c9f884df4fa0dfaf0f6
2011-11-21 08:12:39 +00:00 · 2011-11-21 08:12:39 +00:00 · 1d322bb321
commit 1d322bb321
parent e26f6fe99c
1 changed files with 77 additions and 18 deletions
--- a/src/modules/graphic/ssggraph/grutil.cpp
+++ b/src/modules/graphic/ssggraph/grutil.cpp
@ -20,6 +20,7 @@
 #include <plib/ssg.h>

 #include <tgfclient.h>
+#include <algorithm>

 #include "grutil.h"
 #include "grscene.h"
@ -278,37 +279,95 @@ void grWriteTimeBuf(char *buf, tdble sec, int sgn)
 }


-// TODO: more efficient solution, this one is slow.
-float grGetHOT(float x, float y)
-{
-	sgVec3 test_vec;
+static inline float getPolyHOT(ssgHit& h) {
+  return h.plane[2] == 0.0 ? 0.0 : - h.plane[3] / h.plane[2];
+  }
+
+// Binary comparison of 2 polys to decide which one is
+//  lower over terrain. Used for max_element().
+static inline bool HOTless(ssgHit& g, ssgHit& h) {
+  return getPolyHOT(g) < getPolyHOT(h);
+}
+
+// Get height over terrain => hence HOT
+float grGetHOT(float x, float y) {
 	sgMat4 invmat;
 	sgMakeIdentMat4(invmat);

 	invmat[3][0] = -x;
 	invmat[3][1] = -y;
-	invmat[3][2] =  0.0f         ;
+	invmat[3][2] =  0.0f;

-	test_vec [0] = 0;
-	test_vec [1] = 0;
-	test_vec [2] = 100000.0f;
+	sgVec3 test_vec = { 0 , 0 , 100000.0f };

 	ssgHit *results;
 	int num_hits = ssgHOT (TheScene, test_vec, invmat, &results);
-	float hot = -1000000.0f;
+	ssgHit *h = std::max_element(&results[0], &results[num_hits-1], HOTless);
+	return getPolyHOT(*h);
+} //  grGetHOT

-	for (int i = 0; i < num_hits; i++) {
-		ssgHit *h = &results[i];
+/*
+ * NB: this solution is not faster, only cleaner
+ * than a pure C-way implementation:

-		float hgt = (h->plane[2] == 0.0 ? 0.0 : - h->plane[3] / h->plane[2]);
-
-		if (hgt >= hot) {
-			hot = hgt;
-		}
+  	ssgHit* first = &results[0];
+	ssgHit* last = &results[num_hits-1];
+	ssgHit* largest = first;
+	if (first == last) return getPolyHOT(*last);
+	while (++first != last) {
+      if (getPolyHOT(*largest) < getPolyHOT(*first))
+        largest = first;
 	}
+    return getPolyHOT(*largest);

-	return hot;
-}
+ * The C++ STL docs tells about max_element:
+ * Complexity
+ * Linear: Performs as many comparisons as the number of elements
+ * in [first,last), except for first.
+ *
+ * However if needed, it can be improved like this:
+ *
+ * http://www.physicsforums.com/showthread.php?t=512668
+ * Finding the maximum value in an array is a classic reduction problem;
+ * it's the same as finding the sum, average, etc., just using a
+ * different "binary operator" (one which returns the maximum of the
+ * two arguments). That being said, given a fast (or the fastest) way
+ * to compute the sum of an array of arguments in CUDA, the same
+ * algorithm should be a fast (or the fastest) way to compute the
+ * maximum value in the array.
+ *
+ * So I'm seeing this as a multi-phase solution. In the first phase,
+ * each block should compute the maximum value of the array
+ * corresponding to that block. In the second phase, some subset of
+ * the blocks should compute the maximum values from the computed
+ * maxima of all the blocks that did work in the first phase.
+ * Repeat until only one block is considering all the maximal values,
+ * and the result of this is guaranteed to be the maximum array value.
+ * You can consider things like shared memory, data distribution, etc.
+ * to increase performance.

+Example: A = {3, 10, 1, 9, 2, 8, 3, 4, 8, 2, 1, 7, 2, 5, 6, 1, 2, 5, 3, 2}
+Using 5 blocks to handle 4 elements each:

+Phase 1:
+{3, 10, 1, 9} {2, 8, 3, 4} {8, 2, 1, 7} {2, 5, 6, 1} {2, 5, 3, 2}
+{10, 10, 9, 9} {8, 8, 4, 4} {8, 2, 7, 7} {5, 5, 6, 1} {5, 5, 3, 2}
+{10, 10, 9, 9} {8, 8, 4, 4} {8, 2, 7, 7} {6, 5, 6, 1} {5, 5, 3, 2}

+Phase 2:
+{10, 8, 8, 6} {5}
+{10, 8, 8, 6} {5}
+{10, 8, 8, 6} {5}
+
+Phase 3:
+{10, 5}
+{10, 5}
+
+Maximum is 10
+
+ * Question is: can it bring any real performance plus?
+ * This grGetHOT() is only used in the F10 camera view.
+ * Should look up if there are any other source files that use
+ * a similar approach to find max, sum, avg or min value in an array.
+ *
+*/