System Call Interposition: how to implement virtualization

来源:互联网 发布:怪物猎人ol 知乎 编辑:程序博客网 时间:2024/06/05 20:38

A System Call Interposition (SCI) support tracks all the system service requests of processes. Each system request can be modified or denied.

It is possible to implement tools to trace, monitor, or virtualize processes.

This posting shows three different ways to implement a System Call Interposition service. The simple virtualization problem of hiding the contents of the file /etc/passwd will be implemented by each SCI service, showing the pros and cons of each proposal.

This example can also be used as a proof-of-concept test to propose other services for SCI.

Contents

  • 1 The example
  • 2 Purelibc
  • 3 ptrace
  • 4 kmview.ko (based on utrace)
  • 5 Basic Performance Evaluation

The example

When a process tries to open the file "/etc/passwd" the system call must fail returning errno=ENOENT.

Purelibc

#define _GNU_SOURCE#include <stdio.h>#include <string.h>#include <stdarg.h>#include <sys/syscall.h>#include <unistd.h>#include <purelibc.h>#include <errno.h>static sfun _native_syscall;static char buf[128];static long int mysc(long int sysno, ...){  va_list ap;  long int a1,a2,a3,a4,a5,a6;  va_start (ap, sysno);  a1=va_arg(ap,long int);  a2=va_arg(ap,long int);  a3=va_arg(ap,long int);  a4=va_arg(ap,long int);  a5=va_arg(ap,long int);  a6=va_arg(ap,long int);  va_end(ap);  if (sysno == __NR_open) {    char *path=(char *)a1;    if (a1 && strcmp(path,"/etc/passwd")==0) {      errno=ENOENT;      return -1;    }  }  return _native_syscall(sysno,a1,a2,a3,a4,a5,a6);}  void  __attribute ((constructor))init_test (void){  _native_syscall=_pure_start(mysc,NULL,PUREFLAG_STDALL);}

Compile this source code (sci_purelibc.c):

 gcc -shared -fPIC -o sci_purelibc.so sci_purelibc.c

preload purelibc and this shared object:

 export LD_PRELOAD=libpurelibc.so:/tmp/tests/syscall_interposition/sci_purelibc.so

and now /etc/passwd has disappeared

 $ cat /etc/passwd
 cat: /etc/passwd: No such file or directory

Requirements: depends on the purelibc library

Pros: very fast.

Cons: unsafe (it can be easily circumvented); it works only for dynamically linked executables.

ptrace

#include  <sys/ptrace.h>#include  <sys/types.h>#include  <sys/wait.h>#include  <unistd.h>#include  <stdio.h>#include  <limits.h>#include  <errno.h>#include  <sys/user.h>#include  <asm/ptrace-abi.h>#include  <asm/unistd.h>int main(int argc, char *argv[]){  pid_t child;  long orig_eax;  child = fork();  if(child == 0) {    ptrace(PTRACE_TRACEME, 0, NULL, NULL);    argv++;    execvp(argv[0],argv);  }  else {    int status;    int gotpasswd=0;    int out=0;    while(1) {      waitpid(child,&status,0);      if(WIFEXITED(status) || WIFSIGNALED(status))        break;      orig_eax = ptrace(PTRACE_PEEKUSER, child, 4 * ORIG_EAX, NULL);      if (gotpasswd == 0) {        if (orig_eax == __NR_open) {          if (out==0) {            char path[PATH_MAX];            int i;            long pathaddr=ptrace(PTRACE_PEEKUSER, child, 4 * EBX, NULL);            errno=0;            for (i=0; i<PATH_MAX; i++) {              if ((i&0x3) == 0) {                long chunk=ptrace(PTRACE_PEEKDATA, child, (char *)(pathaddr+i), 0);                if (errno != 0)                  break;                * ((long *) (&path[i])) = chunk;              }              if (path[i] == 0)                break;            }            if (strcmp(path,"/etc/passwd")==0) {              ptrace(PTRACE_POKEUSER, child, 4 * ORIG_EAX, __NR_getpid);              gotpasswd=1;            }          }          out = 1-out;        }      } else {        ptrace(PTRACE_POKEUSER, child, 4 * EAX, -ENOENT);        gotpasswd=out=0;      }      ptrace(PTRACE_SYSCALL, child, NULL, NULL);    }  }  return 0;}

Compile the source code (sci_ptrace.c)

 gcc -o sci_ptrace sci_ptrace.c

Run it:

 ./sci_ptrace cat /etc/passwd
 cat: /etc/passwd: No such file or directory

Requirements: none (the kernel must provide ptrace)

Pros: it works

Cons: slow; many "addresses" are processor architecture dependent; the interface is not clean (some signals, such as SIGSTOP/SIGCONT, cannot be used, and it overrides the natural semantics of the wait system call).

kmview.ko (based on utrace)

#define _GNU_SOURCE#include  <sys/types.h>#include  <sys/wait.h>#include  <unistd.h>#include  <stdio.h>#include  <stdlib.h>#include  <limits.h>#include  <errno.h>#include  <fcntl.h>#include  <string.h>#include  <asm/unistd.h>#include <sys/ioctl.h>#include <kmview.h>void dowait(int signal){  int w;  wait(&w);}#ifdef OPT_PATH_HASHstatic int hash(char *s){    int rv=0;      while (*s) {            rv ^= (rv << 5) + (rv >> 2) + *s;                s++;                  }        return rv;}#endifmain(int argc, char *argv[]){  int fd;  struct kmview_event event;  int flags=0;#ifdef OPT_OPEN_ONLY  int bitmap[INT_PER_MAXSYSCALL];#endif#ifdef OPT_PATH_HASH  struct ghosthash64 gh;#endif  fd=open("/dev/kmview",O_RDONLY);  if (fd <0)    exit(1);#ifdef OPT_OPEN_ONLY  scbitmap_fill(bitmap);  scbitmap_clr(bitmap, __NR_open);  ioctl(fd, KMVIEW_SYSCALLBITMAP,bitmap);#endif#ifdef OPT_PATH_HASH  flags|=KMVIEW_FLAG_PATH_SYSCALL_SKIP;  gh.deltalen[0]=strlen("/etc/passwd");  gh.hash[0] = hash("/etc/passwd");  gh.deltalen[1]=GH_TERMINATE;  ioctl(fd,KMVIEW_GHOSTMOUNTS,&gh);#endif#ifdef OPT_FDSET  flags|=KMVIEW_FLAG_FDSET;#endif  ioctl(fd, KMVIEW_SET_FLAGS, flags);  signal(SIGCHLD,dowait);  if (fork()) {    while (1) {      read(fd,&event,sizeof(event));      switch (event.tag) {        case KMVIEW_EVENT_NEWTHREAD:          {            struct kmview_ioctl_umpid ump;            ump.kmpid=event.x.newthread.kmpid;            ump.umpid=event.x.newthread.kmpid;            ioctl(fd, KMVIEW_UMPID, &ump);            break;          }        case KMVIEW_EVENT_TERMTHREAD:          if (event.x.termthread.remaining == 0)            exit (0);          break;        case KMVIEW_EVENT_SYSCALL_ENTRY:          if (event.x.syscall.scno == __NR_open) {            char path[PATH_MAX];            struct kmview_ioctl_data data={event.x.syscall.x.umpid,              event.x.syscall.args[0],PATH_MAX,path};            ioctl(fd,KMVIEW_READSTRINGDATA, &data);            if (strcmp(path,"/etc/passwd") == 0) {         
     struct kmview_event_ioctl_sysreturn outevent;              outevent.x.kmpid=event.x.syscall.x.umpid;              outevent.retval=-1;              outevent.erno = ENOENT;              ioctl(fd,KMVIEW_SYSVIRTUALIZED, &outevent);            } else              ioctl(fd, KMVIEW_SYSRESUME, event.x.syscall.x.umpid);          } else            ioctl(fd, KMVIEW_SYSRESUME, event.x.syscall.x.umpid);          break;      }    }  } else { /* traced root process*/    ioctl(fd, KMVIEW_ATTACH);    close(fd);    argv++;    execvp(argv[0],argv);  }}

Compile the source code (sci_kmview.c)

 gcc -o sci_kmview sci_kmview.c

Run it:

 ./sci_kmview cat /etc/passwd
 cat: /etc/passwd: No such file or directory

The code includes several optimizations:

  • OPT_OPEN_ONLY: the kernel module filters only the "open" system calls
  • OPT_PATH_HASH: when a system call uses a path, kmview.ko forwards only those whose path matches a hash key
  • OPT_FDSET: kmview.ko manages a table of the "virtualized" file descriptors

Optimizations can be added at compile time using a combination of -DOPT_OPEN_ONLY, -DOPT_PATH_HASH and -DOPT_FDSET.

Requirements: the kernel must support utrace and the kmview.ko kernel module must be loaded

Pros: fast, several optimizations can run in kernel space, clean design (events can be read from a device), architecture independent.

Cons: utrace is not a feature of the vanilla Linux kernel

Basic Performance Evaluation

The benchmarking code is the following:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

/*
 * Benchmark: open and close /etc/passwd and /etc/hosts 100000 times each.
 *
 * close() is deliberately called even when open() failed (as it does for
 * /etc/passwd under virtualization), so that the sequence of calls issued
 * is the same in every measured configuration.
 */
int main(void)
{
  int i;
  int fd;

  for (i = 0; i < 100000; i++) {
    fd = open("/etc/passwd", O_RDONLY);
    close(fd);
    fd = open("/etc/hosts", O_RDONLY);
    close(fd);
  }
  return 0;
}

The execution times are the following:

* kernel (not virtualized): 0.8sec
* purelibc: 0.48sec
* ptrace: ~37.5sec
* kmview.ko (no opt): ~22sec
* kmview.ko (opt): ~7.1sec

(purelibc virtualization is even faster than the non-virtualized case because it generates fewer system calls)

Please note that this example has been designed to provide almost the worst case for the virtualizing service. The implementation based on kmview creates a minimal overhead when tested in a more common scenario (e.g. a compilation):

$ time gcc -o test test.c
real    0m0.147s
user    0m0.084s
sys     0m0.044s
$ time ./sci_kmview gcc -o test test.c
real    0m0.146s
user    0m0.088s
sys     0m0.048s
0 0