[postmodern-devel] Micro-optimizing SQL-ESCAPE-STRING, ESCAPE-BYTES

J.P. Larocque piranha at thoughtcrime.us
Sun Jul 29 19:17:23 UTC 2007


On Sat, 2007-07-28 at 18:28 +0200, Marijn Haverbeke wrote:
> Good catch. I applied a fix, more or less your patch but without the
> (unneccesary) gensym.

Right, cargo-cult habits at work.  =)

I'm doing some stuff involving binary data (geometry information with
PostGIS), and I've found encoding binary data with SQL-COMPILE is very
slow.  For instance, 91 seconds for a 50KiB (VECTOR (UNSIGNED-BYTE 8)).
The slowness is in S-SQL:ESCAPE-BYTES and S-SQL:SQL-ESCAPE-STRING.  It
turns out SBCL's WITH-OUTPUT-TO-STRING is quadratic in time, unless (as
Juho Snellman pointed out on #lisp) you bind *PRINT-PRETTY* to NIL.

These functions are linear on both CLISP and OpenMCL.

Before he mentioned that, I spent some time micro-optimizing these two
functions.  They're ugly--not entirely unlike hairier C code.  You could
say I ended up implementing a very specific, low-level, non-general form
of the functionality of WITH-OUTPUT-TO-STRING.

SQL-COMPILE on the same expression now takes 0.3 seconds, though, and
they are about four times faster than simply making that binding.  This
could be especially important for more heavy-weight users of binary data
in PostgreSQL--say, back-ends for object databases.  (My own requirement
for large binary data is temporary.)

If you're interested, I've attached a file with these functions and some
comparison and testing functions (even uglier).  If nothing else, could
you bind *PRINT-PRETTY* to NIL for the above two functions?

Thanks,

-- 
J.P. Larocque: <piranha at thoughtcrime.us>, <piranha at ely.ath.cx>
-------------- next part --------------
;;; These are the efficient, albeit hairy and micro-optimized versions
;;; of ESCAPE-BYTES and SQL-ESCAPE-STRING.

;;; These functions were tested on the following implementations.
;;; Rates are from the measurement of (SQL-ESCAPE-STRING (ESCAPE-BYTES
;;; v)), without OPTIMIZE declarations, where v is a vector of
;;; 10,000,000 random octets, on a 2GHz Athlon64.
;;;   SBCL 0.9.16 x86_64: 860,807 octets/sec
;;;   OpenMCL 1.1-pre-070512 LinuxX8664: 627,077 octets/sec
;;;   CLISP 2.41 x86_64: 18,243 octets/sec
;;;     (To its credit, CLISP used under 100MiB of RAM, where the
;;;     others used over ~200MiB.)

(defmacro defencoder (name &key get-length set-string documentation
			   (prefix "") (suffix "")
			   (element-type t) (character-type 'character))
  "Defines a function of the given NAME which encodes an input vector
to a string.

The resulting string always starts with the contents of PREFIX and
always ends with the contents of SUFFIX.

Each element of the defined function's input is mapped to zero or more
characters by the given GET-LENGTH and SET-STRING functions.
GET-LENGTH takes one argument, an element, and returns the number of
characters required to encode it.  SET-STRING takes three
arguments--an element, a string, and a position--which is required to
write the encoded form of the element to the given string at the given
position, and then return the number of characters written.

This unfortunately low-level interface was selected to avoid consing
and slow implementations of WITH-OUTPUT-TO-STRING.

The type of the elements of the input vector may be specified with
ELEMENT-TYPE.

The resulting string will be a subtype of `(VECTOR ,CHARACTER-TYPE).
When CHARACTER-TYPE is supplied, SET-STRING must write only characters
of that type."
  (when (or (null get-length) (null set-string))
    (error "Must supply :GET-LENGTH and :SET-STRING."))
  `(let ((get-length-fn ,get-length)
	 (set-string-fn ,set-string)
	 (prefix ,prefix)
	 (suffix ,suffix))
    (defun ,name (input-vec)
      ,@(if (null documentation)
	    '()
	    `(,documentation))
      (let* ((output-length
	      (+ (loop :for element :of-type ,element-type :across input-vec
		       :sum (funcall get-length-fn element))
		 (length prefix)
		 (length suffix)))
	     (output (make-array output-length
				 :element-type ',character-type
				 ;; #\Space for lack of something better.
				 :initial-element #\Space))
	     (output-pos 0))
	(flet ((add-string (s)
		 (setf (subseq output output-pos (+ output-pos (length s)))
		       s)
		 (incf output-pos (length s))))
	  (add-string prefix)
	  (loop :for element :of-type ,element-type :across input-vec
		:for written-count := (funcall set-string-fn element output output-pos)
		:do (incf output-pos written-count))
	  (add-string suffix))
	output))))

(defencoder escape-bytes
    :documentation "Escape an array of octets in PostgreSQL's horribly
inefficient textual format for binary data."
    ;; STANDARD-CHAR contains backslash, digits, and all the ASCII
    ;; characters we pass unescaped.
    :element-type (unsigned-byte 8) :character-type standard-char
    :get-length (lambda (byte)
		  (if (or (< byte 32) (> byte 126) (= byte 39) (= byte 92))
		      4 ; "\ooo"
		      1))
    :set-string (lambda (byte s start)
		  (cond ((or (< byte 32) (> byte 126) (= byte 39) (= byte 92))
			 ;; Write "\ooo".
			 (let ((place64 (floor byte 64))
			       (place8 (mod (floor byte 8) 8))
			       (place1 (mod byte 8)))
			   (setf (elt s (+ start 0)) #\\)
			   (setf (elt s (+ start 1)) (digit-char place64 8))
			   (setf (elt s (+ start 2)) (digit-char place8 8))
			   (setf (elt s (+ start 3)) (digit-char place1 8))
			   4))
			(t (setf (elt s start) (code-char byte))
			   1))))

(defencoder sql-escape-string
    :documentation "Escape string data so it can be used in a query."
    :prefix "'" :suffix "'" :element-type character
    :get-length (lambda (char)
		  (if (member char '(#\' #\\) :test #'char=)
		      2 ; "''" or "\\\\", respectively.
		      1))
    :set-string (lambda (char s start)
		  (case char
		    (#\' ;; Write "''".
		     (setf (elt s start) #\')
		     (setf (elt s (1+ start)) #\')
		     2)
		    (#\\
		    ;; Write "\\\\".  Turn off postgres' backslash
		    ;; behaviour to prevent unexpected strangeness.
		     (setf (elt s start) #\\)
		     (setf (elt s (1+ start)) #\\)
		     2)
		    (t (setf (elt s start) char)
		       1))))

;;; These versions--the originals--take quadratic time on SBCL,
;;; because of WITH-OUTPUT-TO-STRING.  That's SBCL's fault; with
;;; OpenMCL, the time is linear.

(defun escape-bytes-orig (bytes)
  "Escape an array of octets in PostgreSQL's horribly inefficient
textual format for binary data."
  (with-output-to-string (out)
    (loop :for byte :of-type fixnum :across bytes
          :do (if (or (< byte 32) (> byte 126) (= byte 39) (= byte 92))
                  (format out "\\~3,'0o" byte)
                  (princ (code-char byte) out)))))

(defun sql-escape-string-orig (string)
  "Escape string data so it can be used in a query."
  (with-output-to-string (out)
    (princ #\' out)
    (loop :for char :of-type character :across string
          :do (princ (case char
                       (#\' "''")
                       ;; Turn off postgres' backslash behaviour to
                       ;; prevent unexpected strangeness.
                       (#\\ "\\\\")
                       (t char)) out))
    (princ #\' out)))

;;; Some utility functions for comparing the above.

(defun time-escape-function (escape-f input-vec)
	(let* ((start-real-time (get-internal-real-time))
	       (start-run-time (get-internal-run-time))
	       (test-result (time (funcall escape-f input-vec))))
	  (let* ((end-real-time (get-internal-real-time))
		 (end-run-time (get-internal-run-time))
		 (real-time (/ (- end-real-time start-real-time)
			       internal-time-units-per-second))
		 (run-time (/ (- end-run-time start-run-time)
			      internal-time-units-per-second))
		 (real-time-rate (unless (zerop real-time)
				   (/ (length input-vec) real-time)))
		 (run-time-rate (unless (zerop run-time)
				  (/ (length input-vec) run-time))))
	    (format t "Real time: ~,1F (~,1F elt/sec)~%Run time: ~,1F (~,1F elt/sec)~%"
		    real-time real-time-rate run-time run-time-rate)
	    test-result)))

(defun compare-escape-functions (input-size &rest escape-functions)
	(let ((test-vec (make-array `(,input-size) :element-type '(unsigned-byte 8)
				    :initial-element 0))
	      last-result)
	  (loop for i from 0 below input-size
		doing (setf (elt test-vec i) (random 256)))
	  (dolist (escape-function escape-functions)
	    (format t "~S:~%" escape-function)
	    (let ((result (time-escape-function escape-function test-vec)))
	      (unless (null last-result)
		(if (equalp result last-result)
		    (format t "Results match.~%")
		    (format t "*** Results do not match.~%")))
	      (terpri)
	      (setf last-result result))))
	(values))

(defvar *failed-vector* nil
  ;; We have this because the failing vector might be huge and too
  ;; cumbersome for a condition or result value to a REPL.
  "The last vector that failed within TEST-ESCAPE-FUNCTIONS.")

(defun test-escape-functions (type initial-element key elt-upper-bound escape-f-1 escape-f-2)
  (labels ((test-vector (vector desc &rest arguments)
	     (let ((result1 (funcall escape-f-1 vector))
		   (result2 (funcall escape-f-2 vector)))
	       (unless (equalp result1 result2)
		 (setf *failed-vector* vector)
		 (error "~S and ~S return different results for: ~?"
			escape-f-1 escape-f-2 desc arguments)))))
    (test-vector (make-array 0 :element-type type)
		 "empty vector")
    (loop :for len :from 1 :below 10
	  :do (loop :for i :from 0 :below elt-upper-bound
		    :for elt := (funcall key i)
		    :do (unless (null elt)
			  (let ((vector (make-array len :element-type type
						    :initial-element elt)))
			    (test-vector vector "length ~D with element ~S" len elt)))))
    (loop :for len := (random 10)
	  :for vector := (make-array len :element-type type
				     :initial-element initial-element)
	  :repeat 25
	  :do (loop :for i :from 0 :below len
		    :for elt := (loop :for eltx := (funcall key (random elt-upper-bound))
				      :while (null eltx)
				      :finally (return eltx))
		    :doing (setf (elt vector i) elt))
	  :do (test-vector vector "~D random element~:P" len))))


;;; On Lisp, 5.4: Composing Functions
(defun compose (&rest functions)
  (if (endp functions)
      #'identity
      (let ((function1 (car (last functions)))
	    (functions (butlast functions)))
	(lambda (&rest args)
	  (reduce #'funcall functions
		  :from-end t
		  :initial-value (apply function1 args))))))

(defun run-tests ()
  (format t "Testing ESCAPE-BYTES{,-ORIG}.~%")
  (test-escape-functions '(unsigned-byte 8) 0 #'identity 256
			 #'escape-bytes #'escape-bytes-orig)

  ;; Cap at 64K to avoid going through 1M+ Unicode characters.
  (format t "Testing SQL-ESCAPE-STRING{,-ORIG}.~%")
  (test-escape-functions 'character #\Space #'code-char (min char-code-limit 65536)
			 #'sql-escape-string #'sql-escape-string-orig)

  (format t "Testing the composition of SQL-ESCAPE-STRING ESCAPE-BYTES with ...-ORIG.~%")
  (test-escape-functions '(unsigned-byte 8) 0 #'identity 256
			 (compose #'sql-escape-string #'escape-bytes)
			 (compose #'sql-escape-string-orig #'escape-bytes-orig))

  (format t "No problems to report."))


More information about the postmodern-devel mailing list