The situation:
Let's say I have an image A, say, 512x512 pixels, and image B, 5x5 or 7x7 pixels.
Both images are 24bit rgb, and B have 1bit alpha mask (so each pixel is either completely transparent or completely solid). 
I need to find within image A a pixel which (with its' neighbors) most closely resembles image B, OR the pixel that probably most closely resembles image B.
Resemblance is calculated as "distance" which is sum of "distances" between non-transparent B's pixels and A's pixels divided by number of non-transparent B's pixels. Here is a sample SDL code for explanation:
struct Pixel{
    unsigned char b, g, r, a;
};
void fillPixel(int x, int y, SDL_Surface* dst, SDL_Surface* src, int dstMaskX, int dstMaskY){
    Pixel& dstPix = *((Pixel*)((char*)(dst->pixels) + sizeof(Pixel)*x + dst->pitch*y));
    int xMin = x + texWidth - searchWidth;
    int xMax = xMin + searchWidth*2;
    int yMin = y + texHeight - searchHeight;
    int yMax = yMin + searchHeight*2;
    int numFilled = 0;
    for (int curY = yMin; curY < yMax; curY++)
        for (int curX = xMin; curX < xMax; curX++){
            Pixel& cur = *((Pixel*)((char*)(dst->pixels) + sizeof(Pixel)*(curX & texMaskX) + dst->pitch*(curY & texMaskY)));
            if (cur.a != 0)
                numFilled++;
        }
    if (numFilled == 0){
        int srcX = rand() % src->w;
        int srcY = rand() % src->h;
        dstPix = *((Pixel*)((char*)(src->pixels) + sizeof(Pixel)*srcX + src->pitch*srcY));
        dstPix.a = 0xFF;
        return;
    }
    int storedSrcX = rand() % src->w;
    int storedSrcY = rand() % src->h;
    float lastDifference = 3.40282347e+37F;
    //unsigned char mask = 
    for (int srcY = searchHeight; srcY < (src->h - searchHeight); srcY++)
        for (int srcX = searchWidth; srcX < (src->w - searchWidth); srcX++){
            float curDifference = 0;
            int numPixels = 0;
            for (int tmpY = -searchHeight; tmpY < searchHeight; tmpY++)
                for(int tmpX = -searchWidth; tmpX < searchWidth; tmpX++){
                    Pixel& tmpSrc = *((Pixel*)((char*)(src->pixels) + sizeof(Pixel)*(srcX+tmpX) + src->pitch*(srcY+tmpY)));
                    Pixel& tmpDst = *((Pixel*)((char*)(dst->pixels) + sizeof(Pixel)*((x + dst->w + tmpX) & dstMaskX) + dst->pitch*((y + dst->h + tmpY) & dstMaskY)));
                    if (tmpDst.a){
                        numPixels++;
                        int dr = tmpSrc.r - tmpDst.r;
                        int dg = tmpSrc.g - tmpDst.g;
                        int db = tmpSrc.g - tmpDst.g;
                        curDifference += dr*dr + dg*dg + db*db;
                    }
                }
            if (numPixels)
                curDifference /= (float)numPixels;
            if (curDifference < lastDifference){
                lastDifference = curDifference;
                storedSrcX = srcX;
                storedSrcY = srcY;
            }
        }
    dstPix = *((Pixel*)((char*)(src->pixels) + sizeof(Pixel)*storedSrcX + src->pitch*storedSrcY));
    dstPix.a = 0xFF;
}
This thing is supposed to be used for texture generation.
Now, the question:
The easiest way to do this is brute force search (which is used in example routine). But it is slow - even using GPU acceleration and dual core cpu won't make it much faster. It looks like I can't use modified binary search because of B's mask. So, how can I find desired pixel faster? 
Additional Info:  
It is allowed to use 2 cores, GPU acceleration, CUDA, and 1.5..2 gigabytes of RAM for the task.  
I would prefer to avoid some kind of lengthy preprocessing phase that will take 30 minutes to finish.
Ideas?