#include #include #include #include #include #include #include #include #include #include #include #define MY_NAME "lowmem" #define LOWMEM_MAX_UIDS 8 enum { VM_LOWMEM_DENY = 1, VM_LOWMEM_LEVEL1_NOTIFY, VM_LOWMEM_LEVEL2_NOTIFY, VM_LOWMEM_NR_DECAY_PAGES, VM_LOWMEM_ALLOWED_UIDS, VM_LOWMEM_ALLOWED_PAGES, VM_LOWMEM_USED_PAGES, }; static unsigned int deny_percentage; static unsigned int l1_notify, l2_notify; static unsigned int nr_decay_pages; static unsigned long allowed_pages; static long used_pages; static unsigned int allowed_uids[LOWMEM_MAX_UIDS]; static unsigned int minuid = 1; static unsigned int maxuid = 65535; static ctl_table lowmem_table[] = { { .ctl_name = VM_LOWMEM_DENY, .procname = "lowmem_deny_watermark", .data = &deny_percentage, .maxlen = sizeof(unsigned int), .mode = 0644, .child = NULL, .proc_handler = &proc_dointvec, .strategy = &sysctl_intvec, }, { .ctl_name = VM_LOWMEM_LEVEL1_NOTIFY, .procname = "lowmem_notify_low", .data = &l1_notify, .maxlen = sizeof(unsigned int), .mode = 0644, .child = NULL, .proc_handler = &proc_dointvec, .strategy = &sysctl_intvec, }, { .ctl_name = VM_LOWMEM_LEVEL2_NOTIFY, .procname = "lowmem_notify_high", .data = &l2_notify, .maxlen = sizeof(unsigned int), .mode = 0644, .child = NULL, .proc_handler = &proc_dointvec, .strategy = &sysctl_intvec, }, { .ctl_name = VM_LOWMEM_NR_DECAY_PAGES, .procname = "lowmem_nr_decay_pages", .data = &nr_decay_pages, .maxlen = sizeof(unsigned int), .mode = 0644, .child = NULL, .proc_handler = &proc_dointvec_minmax, .strategy = &sysctl_intvec, }, { .ctl_name = VM_LOWMEM_ALLOWED_UIDS, .procname = "lowmem_allowed_uids", .data = &allowed_uids, .maxlen = LOWMEM_MAX_UIDS * sizeof(unsigned int), .mode = 0644, .child = NULL, .proc_handler = &proc_dointvec_minmax, .strategy = &sysctl_intvec, .extra1 = &minuid, .extra2 = &maxuid, }, { .ctl_name = VM_LOWMEM_ALLOWED_PAGES, .procname = "lowmem_allowed_pages", .data = &allowed_pages, .maxlen = sizeof(unsigned long), .mode = 0444, .child = NULL, .proc_handler = &proc_dointvec_minmax, .strategy = &sysctl_intvec, }, { .ctl_name = VM_LOWMEM_USED_PAGES, .procname = "lowmem_used_pages", .data = &used_pages, .maxlen = sizeof(long), .mode = 0444, .child = NULL, .proc_handler = &proc_dointvec_minmax, .strategy = &sysctl_intvec, }, { .ctl_name = 0 } }; static ctl_table lowmem_root_table[] = { { .ctl_name = CTL_VM, .procname = "vm", .mode = 0555, .child = lowmem_table, }, { .ctl_name = 0 } }; #define KERNEL_ATTR_RO(_name) \ static struct subsys_attribute _name##_attr = __ATTR_RO(_name) static int low_watermark_reached, high_watermark_reached; static ssize_t low_watermark_show(struct subsystem *subsys, char *page) { return sprintf(page, "%u\n", low_watermark_reached); } static ssize_t high_watermark_show(struct subsystem *subsys, char *page) { return sprintf(page, "%u\n", high_watermark_reached); } KERNEL_ATTR_RO(low_watermark); KERNEL_ATTR_RO(high_watermark); static void low_watermark_state(int new_state) { int changed = 0; if (low_watermark_reached != new_state) { low_watermark_reached = new_state; changed = 1; } if (changed) sysfs_notify(&kernel_subsys.kset.kobj, NULL, "low_watermark"); } static void high_watermark_state(int new_state) { int changed = 0; if (high_watermark_reached != new_state) { high_watermark_reached = new_state; changed = 1; } if (changed) sysfs_notify(&kernel_subsys.kset.kobj, NULL, "high_watermark"); } static int low_vm_enough_memory(long pages) { unsigned long free, allowed; long deny_threshold, level1, level2, used; int cap_sys_admin = 0, notify; if (cap_capable(current, CAP_SYS_ADMIN) == 0) cap_sys_admin = 1; /* We activate ourselves only after both parameters have been * configured. */ if (deny_percentage == 0 || l1_notify == 0 || l2_notify == 0) return __vm_enough_memory(pages, cap_sys_admin); allowed = totalram_pages - hugetlb_total_pages(); deny_threshold = allowed * deny_percentage / 100; level1 = allowed * l1_notify / 100; level2 = allowed * l2_notify / 100; vm_acct_memory(pages); /* Easily freed pages when under VM pressure or direct reclaim */ free = global_page_state(NR_FILE_PAGES); free += nr_swap_pages; free += global_page_state(NR_SLAB_RECLAIMABLE); used = allowed - free; if (unlikely(used < 0)) used = 0; /* The hot path, plenty of memory */ if (likely(used < level1)) goto enough_memory; /* No luck, lets make it more expensive and try again.. */ used -= nr_free_pages(); if (used >= deny_threshold) { int i; allowed_pages = allowed; used_pages = used; low_watermark_state(1); high_watermark_state(1); /* Memory allocations by root are always allowed */ if (cap_sys_admin) return 0; /* uids from allowed_uids vector are also allowed no matter what */ for (i = 0; i < LOWMEM_MAX_UIDS && allowed_uids[i]; i++) if (current->uid == allowed_uids[i]) return 0; vm_unacct_memory(pages); if (printk_ratelimit()) { printk(MY_NAME ": denying memory allocation to process %d (%s)\n", current->pid, current->comm); } return -ENOMEM; } enough_memory: /* See if we need to notify level 1 */ low_watermark_state(used >= level1); /* * In the level 2 notification case things are more complicated, * as the level that we drop the state and send a notification * should be lower than when it is first triggered. Having this * on the same watermark level ends up bouncing back and forth * when applications are being stupid. */ notify = used >= level2; if (notify || used + nr_decay_pages < level2) high_watermark_state(notify); /* We have plenty of memory */ allowed_pages = allowed; used_pages = used; return 0; } static struct security_operations lowmem_security_ops = { /* Use the capability functions for some of the hooks */ .ptrace = cap_ptrace, .capget = cap_capget, .capset_check = cap_capset_check, .capset_set = cap_capset_set, .capable = cap_capable, .bprm_apply_creds = cap_bprm_apply_creds, .bprm_set_security = cap_bprm_set_security, .task_post_setuid = cap_task_post_setuid, .task_reparent_to_init = cap_task_reparent_to_init, .vm_enough_memory = low_vm_enough_memory, }; static struct ctl_table_header *lowmem_table_header; /* flag to keep track of how we were registered */ static int secondary; static struct attribute *lowmem_attrs[] = { &low_watermark_attr.attr, &high_watermark_attr.attr, NULL, }; static struct attribute_group lowmem_attr_group = { .attrs = lowmem_attrs, }; static int __init lowmem_init(void) { int r; /* register ourselves with the security framework */ if (register_security(&lowmem_security_ops)) { printk(KERN_ERR MY_NAME ": Failure registering with the kernel\n"); /* try registering with primary module */ if (mod_reg_security(MY_NAME, &lowmem_security_ops)) { printk(KERN_ERR ": Failure registering with the primary" "security module.\n"); return -EINVAL; } secondary = 1; } /* initialize the uids vector */ memset(allowed_uids, 0, sizeof(allowed_uids)); lowmem_table_header = register_sysctl_table(lowmem_root_table); if (unlikely(!lowmem_table_header)) return -EPERM; kernel_subsys.kset.kobj.kset = &kernel_subsys.kset; r = sysfs_create_group(&kernel_subsys.kset.kobj, &lowmem_attr_group); if (unlikely(r)) return r; printk(KERN_INFO MY_NAME ": Module initialized.\n"); return 0; } static void __exit lowmem_exit(void) { /* remove ourselves from the security framework */ if (secondary) { if (mod_unreg_security(MY_NAME, &lowmem_security_ops)) printk(KERN_ERR MY_NAME ": Failure unregistering " "with the primary security module.\n"); } else { if (unregister_security(&lowmem_security_ops)) { printk(KERN_ERR MY_NAME ": Failure unregistering " "with the kernel.\n"); } } unregister_sysctl_table(lowmem_table_header); sysfs_remove_group(&kernel_subsys.kset.kobj, &lowmem_attr_group); printk(KERN_INFO MY_NAME ": Module removed.\n"); } module_init(lowmem_init); module_exit(lowmem_exit); MODULE_DESCRIPTION("Low watermark LSM module"); MODULE_LICENSE("GPL");