From: Martin Schwidefsky This fixes a problem in sys_swapon that can cause the creation of invalid swap ptes. The cause is a mismatch between the arch-independent swap entries (swp_entry_t) and the pte-encoded swap entries. The swp_entry_t uses 27 bits for the offset and 5 bits for the type. In sys_swapon this definition is used to find how many swap devices and how many pages on each device there can be. But the swap entries encoded in a pte can be subject to additional restrictions due to the hardware besides the 27/5 division of the bits in the swp_entry_t type. This is solved by adding pte_to_swp_entry and swp_entry_to_pte calls to the calculations for maximum type and offset. In addition the s390 swap pte division for offset/type is changed from 19/6 bits to 20/5 bits. --- 25-akpm/include/asm-s390/pgtable.h | 44 ++++++++++++++++--------------------- 25-akpm/mm/swapfile.c | 30 +++++++++++++++++++++++-- 2 files changed, 48 insertions(+), 26 deletions(-) diff -puN include/asm-s390/pgtable.h~swp_entry-vs-swap_pte-fix include/asm-s390/pgtable.h --- 25/include/asm-s390/pgtable.h~swp_entry-vs-swap_pte-fix 2004-03-25 08:58:23.839734168 -0800 +++ 25-akpm/include/asm-s390/pgtable.h 2004-03-25 08:58:23.843733560 -0800 @@ -719,14 +719,14 @@ extern inline pmd_t * pmd_offset(pgd_t * * information in the lowcore. * Bit 21 and bit 22 are the page invalid bit and the page protection * bit. We set both to indicate a swapped page. - * Bit 31 is used as the software page present bit. If a page is - * swapped this obviously has to be zero. - * This leaves the bits 1-19 and bits 24-30 to store type and offset. - * We use the 7 bits from 24-30 for the type and the 19 bits from 1-19 - * for the offset. - * 0| offset |0110|type |0 - * 00000000001111111111222222222233 - * 01234567890123456789012345678901 + * Bit 30 and 31 are used to distinguish the different page types. For + * a swapped page these bits need to be zero. + * This leaves the bits 1-19 and bits 24-29 to store type and offset. 
+ * We use the 5 bits from 25-29 for the type and the 20 bits from 1-19 + * plus 24 for the offset. + * 0| offset |0110|o|type |00| + * 0 0000000001111111111 2222 2 22222 33 + * 0 1234567890123456789 0123 4 56789 01 * * 64 bit swap entry format: * A page-table entry has some bits we have to treat in a special way. @@ -736,29 +736,25 @@ extern inline pmd_t * pmd_offset(pgd_t * * information in the lowcore. * Bit 53 and bit 54 are the page invalid bit and the page protection * bit. We set both to indicate a swapped page. - * Bit 63 is used as the software page present bit. If a page is - * swapped this obviously has to be zero. - * This leaves the bits 0-51 and bits 56-62 to store type and offset. - * We use the 7 bits from 56-62 for the type and the 52 bits from 0-51 - * for the offset. - * | offset |0110|type |0 - * 0000000000111111111122222222223333333333444444444455555555556666 - * 0123456789012345678901234567890123456789012345678901234567890123 + * Bit 62 and 63 are used to distinguish the different page types. For + * a swapped page these bits need to be zero. + * This leaves the bits 0-51 and bits 56-61 to store type and offset. + * We use the 5 bits from 57-61 for the type and the 53 bits from 0-51 + * plus 56 for the offset. 
+ * | offset |0110|o|type |00| + * 0000000000111111111122222222223333333333444444444455 5555 5 55566 66 + * 0123456789012345678901234567890123456789012345678901 2345 6 78901 23 */ extern inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) { pte_t pte; - pte_val(pte) = (type << 1) | (offset << 12) | _PAGE_INVALID_SWAP; -#ifndef __s390x__ - BUG_ON((pte_val(pte) & 0x80000901) != 0); -#else /* __s390x__ */ - BUG_ON((pte_val(pte) & 0x901) != 0); -#endif /* __s390x__ */ + pte_val(pte) = _PAGE_INVALID_SWAP | ((type & 0x1f) << 2) | + ((offset & 1) << 7) | ((offset & 0xffffe) << 11); return pte; } -#define __swp_type(entry) (((entry).val >> 1) & 0x3f) -#define __swp_offset(entry) ((entry).val >> 12) +#define __swp_type(entry) (((entry).val >> 2) & 0x1f) +#define __swp_offset(entry) (((entry).val >> 11) | (((entry).val >> 7) & 1)) #define __swp_entry(type,offset) ((swp_entry_t) { pte_val(mk_swap_pte((type),(offset))) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) diff -puN mm/swapfile.c~swp_entry-vs-swap_pte-fix mm/swapfile.c --- 25/mm/swapfile.c~swp_entry-vs-swap_pte-fix 2004-03-25 08:58:23.840734016 -0800 +++ 25-akpm/mm/swapfile.c 2004-03-25 08:58:23.845733256 -0800 @@ -1302,7 +1302,19 @@ asmlinkage long sys_swapon(const char __ if (!(p->flags & SWP_USED)) break; error = -EPERM; - if (type >= MAX_SWAPFILES) { + /* + * Test if adding another swap device is possible. There are + * two limiting factors: 1) the number of bits for the swap + * type in the swp_entry_t definition and 2) the number of bits for + * the swap type in the swap ptes as defined by the different + * architectures. To honor both limitations a swap entry + * with swap offset 0 and swap type ~0UL is created, encoded + * to a swap pte, decoded to a swp_entry_t again and finally + * the swap type part is extracted. This will mask all bits + * from the initial ~0UL that can't be encoded in either the + * swp_entry_t or the architecture definition of a swap pte. 
+ */ + if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) { swap_list_unlock(); goto out; } @@ -1424,7 +1436,21 @@ asmlinkage long sys_swapon(const char __ } p->lowest_bit = 1; - maxpages = swp_offset(swp_entry(0,~0UL)) - 1; + /* + * Find out how many pages are allowed for a single swap + * device. There are two limiting factors: 1) the number of + * bits for the swap offset in the swp_entry_t type and + * 2) the number of bits in a swap pte as defined by + * the different architectures. In order to find the + * largest possible bit mask a swap entry with swap type 0 + * and swap offset ~0UL is created, encoded to a swap pte, + * decoded to a swp_entry_t again and finally the swap + * offset is extracted. This will mask all the bits from + * the initial ~0UL mask that can't be encoded in either + * the swp_entry_t or the architecture definition of a + * swap pte. + */ + maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1; if (maxpages > swap_header->info.last_page) maxpages = swap_header->info.last_page; p->highest_bit = maxpages - 1; _