Question

I need to compare pHashes (phash.org) with a hamming distance function.

I tried the one from pg_similarity, but it doesn't seem to work right. (identical pHashes don't have a hamming distance of 0).

So I figured I'd just use a c-extension to use the ph_hamming_distance function that's part of the pHash library.

What I've got: phash.c

#include <postgres.h>
#include <pHash.h>
#include <fmgr.h>
#include <utils/bytea.h>
#include <utils/datum.h>

// Emit the module magic block, but only when the included PostgreSQL
// headers actually define the PG_MODULE_MAGIC macro.
#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif

/*
 * phash_hamming(bytea, bytea) -> int
 *
 * Returns the Hamming distance between two 64-bit pHash values, each passed
 * as an 8-byte bytea. Returns NULL if either argument is NULL and raises an
 * error if either argument is not exactly 8 bytes long.
 */
PG_FUNCTION_INFO_V1(phash_hamming);
Datum phash_hamming(PG_FUNCTION_ARGS) {

    bytea *bytea1;
    bytea *bytea2;
    ulong64 long1;
    ulong64 long2;
    int32 ret;

    // Guard against NULL arguments in case the SQL-level function is not
    // declared STRICT; detoasting a NULL datum would crash the backend.
    if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
        PG_RETURN_NULL();

    bytea1 = PG_GETARG_BYTEA_P(0);
    bytea2 = PG_GETARG_BYTEA_P(1);

    // A pHash is 64 bits, so the payload (total size minus the varlena
    // header) must be exactly 8 bytes.
    if (VARSIZE(bytea1) - VARHDRSZ != sizeof(ulong64) ||
        VARSIZE(bytea2) - VARHDRSZ != sizeof(ulong64))
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                 errmsg("phash_hamming: both arguments must be exactly %d bytes long",
                        (int) sizeof(ulong64))));

    // Copy from VARDATA(), not from the bytea pointer itself: the pointer
    // addresses the varlena length header, not the hash bytes. memcpy also
    // avoids an unaligned 64-bit load.
    memcpy(&long1, VARDATA(bytea1), sizeof(ulong64));
    memcpy(&long2, VARDATA(bytea2), sizeof(ulong64));

    ret = ph_hamming_distance(long1, long2);

    PG_RETURN_INT32(ret);
}

Makefile

# Builds phash.so from phash.c (compiled as C++ because it includes pHash.h).

PG_CONFIG ?= pg_config

# Ask pg_config where the server headers live instead of hard-coding a
# distribution-specific path. Kept out of CXXFLAGS so a command-line
# CXXFLAGS override cannot drop it.
PG_CPPFLAGS := -I$(shell $(PG_CONFIG) --includedir-server)

# Libraries go in LDLIBS so they land AFTER the objects on the link line.
# Note: a bare -Bstatic given to the g++ driver is parsed as a -B search
# prefix, not as a linker mode; use -Wl,-Bstatic -lpHash -Wl,-Bdynamic if a
# static (and PIC-built) libpHash is really wanted.
LDLIBS += -lpHash

.PHONY: all install clean

all: phash.so

phash.o: phash.c
	$(CXX) $(PG_CPPFLAGS) $(CPPFLAGS) $(CXXFLAGS) -fpic -c $< -o $@

phash.so: phash.o
	$(CXX) $(LDFLAGS) -shared -o $@ $^ $(LDLIBS)

install: phash.so
	cp phash.so "`$(PG_CONFIG) --pkglibdir`"

clean:
	rm -f phash.o phash.so

SQL

 -- STRICT: return NULL without calling the C code when either argument is NULL.
 -- IMMUTABLE: the result depends only on the two arguments.
 CREATE FUNCTION phash_hamming (bytea1 bytea, bytea2 bytea) RETURNS int AS '$libdir/phash' LANGUAGE C IMMUTABLE STRICT;

Error that I'm getting:

ERROR:  could not load library "/usr/lib/postgresql/phash.so": /usr/lib/postgresql/phash.so: undefined symbol: _Z16pg_detoast_datumP7varlena

I must not be linking right to postgresql somehow?

Was it helpful?

Solution

It's an old question, but...

  1. There is no need to add an extra wrapper file and compile it with gcc.
  2. You need to wrap both the PostgreSQL headers and the PostgreSQL macros in extern "C".

    extern "C" {
      #include <postgres.h>
      #include <fmgr.h>
      #ifdef PG_MODULE_MAGIC
      PG_MODULE_MAGIC;
      #endif
      // PG_FUNCTION_INFO_V1(...) and the function definitions it names
      // belong inside this block as well, so their symbols are not mangled.
    }
    

OTHER TIPS

I'm still convinced there might be a better way but this is what I did that worked.

(I will add range-checking instead of just assuming all byteas are 8 bytes (64 bits)... eventually. Leaving a potential segfault in production would be bad, so it's a good thing this is just a toy project.)

phash.c - pure C file, compiled with gcc

#include <postgres.h>
#include <fmgr.h>
#include <utils/bytea.h>
#include <utils/datum.h>

// 64-bit unsigned hash type, declared locally because this C file does not
// include pHash.h. NOTE(review): presumably mirrors the ulong64 typedef in
// pHash.h -- confirm the two stay in sync.
#if defined(_MSC_VER) || defined(__BORLANDC__)
typedef unsigned __int64 ulong64;
#else
typedef unsigned long long ulong64;
#endif

// C-linkage shim around ph_hamming_distance(); defined in phash_wrapper.cpp.
extern int32 c_ph_hamming_distance (ulong64 b1, ulong64 b2);

// Emit the module magic block only when the headers define the macro.
#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif

PG_FUNCTION_INFO_V1(phash_hamming);
Datum phash_hamming(PG_FUNCTION_ARGS) {

    bytea *bytea1 = PG_GETARG_BYTEA_P(0);
    bytea *bytea2 = PG_GETARG_BYTEA_P(1);
    //FIXME - length of bytea1 & bytea2 must be 4 bytes (64bits)

    ulong64 long1 = *((ulong64*) bytea1);
    ulong64 long2 = *((ulong64*) bytea2);

    int32 ret = c_ph_hamming_distance(long1, long2);

    PG_RETURN_INT32(ret);
}

phash_wrapper.cpp - exposes a version of ph_hamming_distance with C linkage instead of C++ linkage (compiled with g++)

#include <pHash.h>
// C-linkage entry point so plain C code can call pHash's C++
// ph_hamming_distance(); simply forwards both hashes and returns the result.
extern "C" int c_ph_hamming_distance (ulong64 b1, ulong64 b2)
{
    const int distance = ph_hamming_distance(b1, b2);
    return distance;
}

Makefile

# Builds phash.so: a C PostgreSQL module (phash.c) plus a C++ shim
# (phash_wrapper.cpp) that exposes pHash's ph_hamming_distance with C linkage.

PG_CONFIG ?= pg_config

# Ask pg_config where the server headers live instead of hard-coding a
# distribution-specific path. Kept out of CFLAGS so a command-line CFLAGS
# override cannot drop it.
PG_CPPFLAGS := -I$(shell $(PG_CONFIG) --includedir-server)

# Libraries go in LDLIBS so they land AFTER the objects on the link line;
# listed first, -lpHash can be discarded by linkers that default to --as-needed.
LDLIBS += -lpHash

.PHONY: all install clean

all: phash.so

phash_wrapper.o: phash_wrapper.cpp
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -fpic -c $< -o $@

phash.o: phash.c
	$(CC) $(PG_CPPFLAGS) $(CPPFLAGS) $(CFLAGS) -fpic -c $< -o $@

# Link with the C++ driver: phash_wrapper.o is C++ and may need the C++ runtime.
phash.so: phash.o phash_wrapper.o
	$(CXX) $(LDFLAGS) -shared -o $@ $^ $(LDLIBS)

install: phash.so
	cp phash.so "`$(PG_CONFIG) --pkglibdir`"

clean:
	rm -f phash.o phash.so phash_wrapper.o

SQL - the same

-- STRICT: return NULL without calling the C code when either argument is NULL.
-- IMMUTABLE: the result depends only on the two arguments.
CREATE FUNCTION phash_hamming (bytea1 bytea, bytea2 bytea) RETURNS int AS '$libdir/phash' LANGUAGE C IMMUTABLE STRICT;
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top