When inlining assembly in gcc, I find myself regularly having to add empty asm blocks in order to keep variables alive in earlier blocks, for example:
asm("rcr $1,%[borrow];"
    "movq 0(%[b_],%[i],8),%%rax;"
    "adcq %%rax,0(%[r_top],%[i],8);"
    "rcl $1,%[borrow];"
    : [borrow]"+r"(borrow)
    : [i]"r"(i),[b_]"r"(b_.data),[r_top]"r"(r_top.data)
    : "%rax","%rdx");
asm("" : : "r"(borrow) : ); // work-around to keep borrow alive ...
Another example of weirdness is that the code below works great without optimizations, but with -O3 it seg-faults:
ulong carry = 0,hi = 0,qh = s.data[1],ql = s.data[0];
asm("movq 0(%[b]),%%rax;"
    "mulq %[ql];"
    "movq %%rax,0(%[sb]);"
    "movq %%rdx,%[hi];"
    : [hi]"=r"(hi)
    : [ql]"r"(ql),[b]"r"(b.data),[sb]"r"(sb.data)
    : "%rax","%rdx","memory");
for (long i = 1; i < b.size; i++)
{
    asm("movq 0(%[b],%[i],8),%%rax;"
        "mulq %[ql];"
        "xorq %%r10,%%r10;"
        "addq %%rax,%[hi];"
        "adcq %%rdx,%[carry];"
        "adcq $0,%%r10;"
        "movq -8(%[b],%[i],8),%%rax;"
        "mulq %[qh];"
        "addq %%rax,%[hi];"
        "adcq %%rdx,%[carry];"
        "adcq $0,%%r10;"
        "movq %[hi],0(%[sb],%[i],8);"
        "movq %[carry],%[hi];"
        "movq %%r10,%[carry];"
        : [carry]"+r"(carry),[hi]"+r"(hi)
        : [i]"r"(i),[ql]"r"(ql),[qh]"r"(qh),[b]"r"(b.data),[sb]"r"(sb.data)
        : "%rax","%rdx","%r10","memory");
}
asm("movq -8(%[b],%[i],8),%%rax;"
    "mulq %[qh];"
    "addq %%rax,%[hi];"
    "adcq %%rdx,%[carry];"
    "movq %[hi],0(%[sb],%[i],8);"
    "movq %[carry],8(%[sb],%[i],8);"
    : [hi]"+r"(hi),[carry]"+r"(carry)
    : [i]"r"(long(b.size)),[qh]"r"(qh),[b]"r"(b.data),[sb]"r"(sb.data)
    : "%rax","%rdx","memory");
I think it has to do with the fact that it's using so many registers. Is there something I'm missing here or is the register allocation just really buggy with gcc inline assembly?