Source: arch/i386/kernel/setup.c
The call graph for this function is shown in Figure 2.3. This function gets the necessary information to give to the boot memory allocator to initialise itself. It is broken up into a number of different tasks.
991 static unsigned long __init setup_memory(void)
992 {
993       unsigned long bootmap_size, start_pfn, max_low_pfn;
994 
995       /*
996        * partially used pages are not usable - thus
997        * we are rounding upwards:
998        */
999       start_pfn = PFN_UP(__pa(&_end));
1000 
1001      find_max_pfn();
1002 
1003      max_low_pfn = find_max_low_pfn();
1004 
1005 #ifdef CONFIG_HIGHMEM
1006      highstart_pfn = highend_pfn = max_pfn;
1007      if (max_pfn > max_low_pfn) {
1008            highstart_pfn = max_low_pfn;
1009      }
1010      printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
1011            pages_to_mb(highend_pfn - highstart_pfn));
1012 #endif
1013      printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
1014                  pages_to_mb(max_low_pfn));
1018      bootmap_size = init_bootmem(start_pfn, max_low_pfn);
1019 
1020      register_bootmem_low_pages(max_low_pfn);
1021 
1028      reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) +
1029            bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY));
1030 
1035      reserve_bootmem(0, PAGE_SIZE);
1036 
1037 #ifdef CONFIG_SMP
1043      reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
1044 #endif
1045 #ifdef CONFIG_ACPI_SLEEP
1046      /*
1047       * Reserve low memory region for sleep support.
1048       */
1049      acpi_reserve_bootmem();
1050 #endif
1051 #ifdef CONFIG_X86_LOCAL_APIC
1052       /*
1053        * Find and reserve possible boot-time SMP configuration:
1054        */
1055       find_smp_config();
1056 #endif
1057 #ifdef CONFIG_BLK_DEV_INITRD
1058       if (LOADER_TYPE && INITRD_START) {
1059             if (INITRD_START + INITRD_SIZE <= 
                    (max_low_pfn << PAGE_SHIFT)) {
1060                   reserve_bootmem(INITRD_START, INITRD_SIZE);
1061                   initrd_start =
1062                    INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
1063                   initrd_end = initrd_start+INITRD_SIZE;
1064             }
1065             else {
1066                   printk(KERN_ERR 
                           "initrd extends beyond end of memory "
1067                       "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
1068                       INITRD_START + INITRD_SIZE,
1069                       max_low_pfn << PAGE_SHIFT);
1070                   initrd_start = 0;
1071             }
1072       }
1073 #endif
1074 
1075       return max_low_pfn;
1076 }
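The start_pfn calculation on line 999 depends on how the PFN macros round. As a minimal standalone sketch (assuming 4KiB pages and a made-up address for _end; the PFN_UP() and PFN_PHYS() definitions are reproduced here for illustration), this shows why a partially used page is rounded upwards when it marks the start of free memory:

#include <stdio.h>

#define PAGE_SHIFT 12                   /* assume 4KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Round an address up to the next full page frame: a partially
 * used page is not usable as free memory */
#define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
/* Convert a page frame number back to a physical address */
#define PFN_PHYS(x) ((x) << PAGE_SHIFT)

int main(void)
{
    /* Hypothetical physical address of _end, the end of the
     * loaded kernel image */
    unsigned long pa_end = 0x2fe123UL;
    unsigned long start_pfn = PFN_UP(pa_end);

    printf("_end at 0x%lx -> first fully free frame is PFN %lu (0x%lx)\n",
           pa_end, start_pfn, PFN_PHYS(start_pfn));
    return 0;
}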
This is the top-level function used to initialise each of the zones. The size of each zone in page frames was discovered during setup_memory() (see Section B.1.1). This function populates an array of zone sizes for passing to free_area_init().
323 static void __init zone_sizes_init(void)
324 {
325     unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
326     unsigned int max_dma, high, low;
327 
328     max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
329     low = max_low_pfn;
330     high = highend_pfn;
331 
332     if (low < max_dma)
333         zones_size[ZONE_DMA] = low;
334     else {
335         zones_size[ZONE_DMA] = max_dma;
336         zones_size[ZONE_NORMAL] = low - max_dma;
337 #ifdef CONFIG_HIGHMEM
338         zones_size[ZONE_HIGHMEM] = high - low;
339 #endif
340     }
341     free_area_init(zones_size);
342 }
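To make the branch above concrete, the following userspace sketch (with hypothetical PFN values standing in for max_dma, max_low_pfn and highend_pfn) fills the zones_size array the same way zone_sizes_init() does on a machine with more memory than the DMA limit:

#include <stdio.h>

enum { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, MAX_NR_ZONES };

int main(void)
{
    unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};

    /* Hypothetical values in 4KiB page frames: 16MiB DMA limit,
     * 896MiB of low memory, 2GiB of physical memory in total */
    unsigned long max_dma = 4096;      /* 16MiB  / 4KiB */
    unsigned long low     = 229376;    /* 896MiB / 4KiB */
    unsigned long high    = 524288;    /* 2GiB   / 4KiB */

    if (low < max_dma) {
        /* Small machine: all memory fits within ZONE_DMA */
        zones_size[ZONE_DMA] = low;
    } else {
        zones_size[ZONE_DMA]     = max_dma;
        zones_size[ZONE_NORMAL]  = low - max_dma;
        zones_size[ZONE_HIGHMEM] = high - low;
    }

    printf("DMA: %lu pages, NORMAL: %lu pages, HIGHMEM: %lu pages\n",
           zones_size[ZONE_DMA], zones_size[ZONE_NORMAL],
           zones_size[ZONE_HIGHMEM]);
    return 0;
}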
This is the architecture-independent function for setting up a UMA architecture. It simply calls the core function, passing the static contig_page_data as the node. NUMA architectures will use free_area_init_node() instead.
838 void __init free_area_init(unsigned long *zones_size)
839 {
840     free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 
                            0, 0, 0);
841 }
There are two versions of this function. The first is almost identical to free_area_init(), except that it uses a different starting physical address; it is for architectures that have only one node (so they use contig_page_data) but whose physical address is not at 0.
This version of the function, called after the pagetable initialisation, is for initialising each pgdat in the system. The caller may allocate its own local portion of the mem_map and pass it in as a parameter if it wants to optimise its location for the architecture. If it chooses not to, the mem_map will be allocated later by free_area_init_core().
 61 void __init free_area_init_node(int nid, 
        pg_data_t *pgdat, struct page *pmap,
 62     unsigned long *zones_size, unsigned long zone_start_paddr, 
 63     unsigned long *zholes_size)
 64 {
 65     int i, size = 0;
 66     struct page *discard;
 67 
 68     if (mem_map == (mem_map_t *)NULL)
 69         mem_map = (mem_map_t *)PAGE_OFFSET;
 70 
 71     free_area_init_core(nid, pgdat, &discard, zones_size, 
                        zone_start_paddr,
 72                     zholes_size, pmap);
 73     pgdat->node_id = nid;
 74 
 75     /*
 76      * Get space for the valid bitmap.
 77      */
 78     for (i = 0; i < MAX_NR_ZONES; i++)
 79         size += zones_size[i];
 80     size = LONG_ALIGN((size + 7) >> 3);
 81     pgdat->valid_addr_bitmap = 
                     (unsigned long *)alloc_bootmem_node(pgdat, size);
 82     memset(pgdat->valid_addr_bitmap, 0, size);
 83 }
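Lines 78 to 81 need one bit per page, rounded up first to whole bytes and then to a sizeof(long) boundary. A small sketch of that arithmetic (LONG_ALIGN() is redefined here to match its kernel definition; the node size is hypothetical):

#include <stdio.h>

/* Align a byte count up to a sizeof(long) boundary, as the
 * kernel's LONG_ALIGN() does */
#define LONG_ALIGN(x) (((x) + sizeof(long) - 1) & ~(sizeof(long) - 1))

int main(void)
{
    unsigned long pages = 229383;   /* hypothetical node size in pages */

    /* One bit per page: add 7 and shift right by 3 to round the
     * bit count up to whole bytes */
    unsigned long bytes = LONG_ALIGN((pages + 7) >> 3);

    printf("%lu pages need a %lu byte valid_addr_bitmap\n", pages, bytes);
    return 0;
}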
This function is responsible for initialising all zones and allocating their local lmem_map within a node. In UMA architectures, this function is called in a way that will initialise the global mem_map array. In NUMA architectures, the array is treated as a virtual array that is sparsely populated.
684 void __init free_area_init_core(int nid, 
        pg_data_t *pgdat, struct page **gmap,
685     unsigned long *zones_size, unsigned long zone_start_paddr, 
686     unsigned long *zholes_size, struct page *lmem_map)
687 {
688     unsigned long i, j;
689     unsigned long map_size;
690     unsigned long totalpages, offset, realtotalpages;
691     const unsigned long zone_required_alignment = 
                                              1UL << (MAX_ORDER-1);
692 
693     if (zone_start_paddr & ~PAGE_MASK)
694         BUG();
695 
696     totalpages = 0;
697     for (i = 0; i < MAX_NR_ZONES; i++) {
698         unsigned long size = zones_size[i];
699         totalpages += size;
700     }
701     realtotalpages = totalpages;
702     if (zholes_size)
703         for (i = 0; i < MAX_NR_ZONES; i++)
704             realtotalpages -= zholes_size[i];
705             
706     printk("On node %d totalpages: %lu\n", nid, realtotalpages);
This block is mainly responsible for counting the total number of pages in the node, subtracting the memory holes given by zholes_size to get realtotalpages.
708     /*
709      * Some architectures (with lots of mem and discontinous memory
710      * maps) have to search for a good mem_map area:
711      * For discontigmem, the conceptual mem map array starts from 
712      * PAGE_OFFSET, we need to align the actual array onto a mem map 
713      * boundary, so that MAP_NR works.
714      */
715     map_size = (totalpages + 1)*sizeof(struct page);
716     if (lmem_map == (struct page *)0) {
717         lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
718         lmem_map = (struct page *)(PAGE_OFFSET + 
719             MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
720     }
721     *gmap = pgdat->node_mem_map = lmem_map;
722     pgdat->node_size = totalpages;
723     pgdat->node_start_paddr = zone_start_paddr;
724     pgdat->node_start_mapnr = (lmem_map - mem_map);
725     pgdat->nr_zones = 0;
726
727     offset = lmem_map - mem_map;    
This block allocates the local lmem_map if necessary and sets gmap. In UMA architectures, gmap is actually mem_map, so this is where the memory for it is allocated.
728     for (j = 0; j < MAX_NR_ZONES; j++) {
729         zone_t *zone = pgdat->node_zones + j;
730         unsigned long mask;
731         unsigned long size, realsize;
732 
733         zone_table[nid * MAX_NR_ZONES + j] = zone;
734         realsize = size = zones_size[j];
735         if (zholes_size)
736             realsize -= zholes_size[j];
737 
738         printk("zone(%lu): %lu pages.\n", j, size);
739         zone->size = size;
740         zone->name = zone_names[j];
741         zone->lock = SPIN_LOCK_UNLOCKED;
742         zone->zone_pgdat = pgdat;
743         zone->free_pages = 0;
744         zone->need_balance = 0;
745         if (!size)
746             continue;
This block starts a loop which initialises every zone_t within the node. The initialisation starts with the setting of the simpler fields for which values already exist.
752         zone->wait_table_size = wait_table_size(size);
753         zone->wait_table_shift =
754             BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
755         zone->wait_table = (wait_queue_head_t *)
756             alloc_bootmem_node(pgdat, zone->wait_table_size
757                         * sizeof(wait_queue_head_t));
758 
759         for(i = 0; i < zone->wait_table_size; ++i)
760             init_waitqueue_head(zone->wait_table + i);
Initialise the waitqueue for this zone. Processes waiting on pages in the zone use this hashed table to select a queue to wait on. This means that when a page is unlocked, only a small subset of the waiting processes has to be woken, rather than every process waiting in the zone.
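To show the shape of the scheme, here is a simplified userspace sketch of a hashed wait table (the WAIT_TABLE_SIZE, structures and hash below are illustrative only; the real page_waitqueue() in mm/filemap.c mixes the pointer bits much more thoroughly):

#include <stdio.h>

#define WAIT_TABLE_SIZE 64   /* illustrative; must be a power of two */

struct page { unsigned long flags; };

/* Stand-in for wait_queue_head_t, reduced to a stub for the demo */
struct wait_queue_head { int sleepers; };

static struct wait_queue_head wait_table[WAIT_TABLE_SIZE];

/* Hash a page pointer to one of the shared queues. The shift just
 * discards the low bits common to all struct page addresses */
static struct wait_queue_head *page_waitqueue(struct page *page)
{
    unsigned long hash = (unsigned long)page >> 4;
    return &wait_table[hash & (WAIT_TABLE_SIZE - 1)];
}

int main(void)
{
    struct page pages[4];
    int i;

    for (i = 0; i < 4; i++)
        printf("page %d waits on queue %ld\n", i,
               (long)(page_waitqueue(&pages[i]) - wait_table));
    return 0;
}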
762         pgdat->nr_zones = j+1;
763 
764         mask = (realsize / zone_balance_ratio[j]);
765         if (mask < zone_balance_min[j])
766             mask = zone_balance_min[j];
767         else if (mask > zone_balance_max[j])
768             mask = zone_balance_max[j];
769         zone->pages_min = mask;
770         zone->pages_low = mask*2;
771         zone->pages_high = mask*3;
772 
773         zone->zone_mem_map = mem_map + offset;
774         zone->zone_start_mapnr = offset;
775         zone->zone_start_paddr = zone_start_paddr;
776 
777         if ((zone_start_paddr >> PAGE_SHIFT) & 
                                          (zone_required_alignment-1))
778             printk("BUG: wrong zone alignment, it will crash\n");
779 
Calculate the watermarks for the zone and record the location of the zone. The watermarks are calculated as ratios of the zone size.
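With the default 2.4 tuning (zone_balance_ratio of 128, with the result clamped between zone_balance_min of 20 and zone_balance_max of 255 pages), the watermarks come out in a 1:2:3 ratio. A standalone sketch of the calculation with a hypothetical zone size:

#include <stdio.h>

int main(void)
{
    /* Default 2.4 tuning values from mm/page_alloc.c */
    unsigned long zone_balance_ratio = 128;
    unsigned long zone_balance_min   = 20;
    unsigned long zone_balance_max   = 255;

    unsigned long realsize = 225280;   /* hypothetical ZONE_NORMAL pages */

    /* 1/128th of the zone, clamped to the min/max bounds */
    unsigned long mask = realsize / zone_balance_ratio;
    if (mask < zone_balance_min)
        mask = zone_balance_min;
    else if (mask > zone_balance_max)
        mask = zone_balance_max;

    printf("pages_min=%lu pages_low=%lu pages_high=%lu\n",
           mask, mask * 2, mask * 3);
    return 0;
}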
780         /*
781          * Initially all pages are reserved - free ones are freed
782          * up by free_all_bootmem() once the early boot process is
783          * done. Non-atomic initialization, single-pass.
784          */
785         for (i = 0; i < size; i++) {
786             struct page *page = mem_map + offset + i;
787             set_page_zone(page, nid * MAX_NR_ZONES + j);
788             set_page_count(page, 0);
789             SetPageReserved(page);
790             INIT_LIST_HEAD(&page->list);
791             if (j != ZONE_HIGHMEM)
792                 set_page_address(page, __va(zone_start_paddr));
793             zone_start_paddr += PAGE_SIZE;
794         }
795 
796         offset += size;
797         for (i = 0; ; i++) {
798             unsigned long bitmap_size;
799 
800             INIT_LIST_HEAD(&zone->free_area[i].free_list);
801             if (i == MAX_ORDER-1) {
802                 zone->free_area[i].map = NULL;
803                 break;
804             }
805 
829             bitmap_size = (size-1) >> (i+4);
830             bitmap_size = LONG_ALIGN(bitmap_size+1);
831             zone->free_area[i].map = 
832               (unsigned long *) alloc_bootmem_node(pgdat, 
                                                       bitmap_size);
833         }
834     }
835     build_zonelists(pgdat);
836 }
This block initialises the free lists for the zone and allocates the bitmap used by the buddy allocator to record the state of page buddies.
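Each order-i bitmap needs one bit per pair of order-i buddies, that is one bit per 2^(i+1) pages, which is what the (size-1) >> (i+4) expression computes in bytes before rounding. A sketch of the arithmetic for every order (MAX_ORDER and LONG_ALIGN() are redefined here to match the kernel; the zone size is hypothetical):

#include <stdio.h>

#define MAX_ORDER 10
#define LONG_ALIGN(x) (((x) + sizeof(long) - 1) & ~(sizeof(long) - 1))

int main(void)
{
    unsigned long size = 225280;    /* hypothetical zone size in pages */
    int i;

    for (i = 0; i < MAX_ORDER - 1; i++) {
        /* One bit per pair of order-i buddies, i.e. per 2^(i+1)
         * pages, and a further factor of 2^3 for bits to bytes */
        unsigned long bitmap_size = (size - 1) >> (i + 4);
        bitmap_size = LONG_ALIGN(bitmap_size + 1);
        printf("order %d: %lu bytes\n", i, bitmap_size);
    }
    return 0;
}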
This builds the list of fallback zones for each zone in the requested node. These are consulted when an allocation cannot be satisfied from the preferred zone. When this is finished, allocations from ZONE_HIGHMEM will fall back to ZONE_NORMAL, and allocations from ZONE_NORMAL will fall back to ZONE_DMA, which in turn has nothing to fall back on.
589 static inline void build_zonelists(pg_data_t *pgdat)
590 {
591     int i, j, k;
592 
593     for (i = 0; i <= GFP_ZONEMASK; i++) {
594         zonelist_t *zonelist;
595         zone_t *zone;
596 
597         zonelist = pgdat->node_zonelists + i;
598         memset(zonelist, 0, sizeof(*zonelist));
599 
600         j = 0;
601         k = ZONE_NORMAL;
602         if (i & __GFP_HIGHMEM)
603             k = ZONE_HIGHMEM;
604         if (i & __GFP_DMA)
605             k = ZONE_DMA;
606 
607         switch (k) {
608             default:
609                 BUG();
610             /*
611              * fallthrough:
612              */
613             case ZONE_HIGHMEM:
614                 zone = pgdat->node_zones + ZONE_HIGHMEM;
615                 if (zone->size) {
616 #ifndef CONFIG_HIGHMEM
617                     BUG();
618 #endif
619                     zonelist->zones[j++] = zone;
620                 }
621             case ZONE_NORMAL:
622                 zone = pgdat->node_zones + ZONE_NORMAL;
623                 if (zone->size)
624                     zonelist->zones[j++] = zone;
625             case ZONE_DMA:
626                 zone = pgdat->node_zones + ZONE_DMA;
627                 if (zone->size)
628                     zonelist->zones[j++] = zone;
629         }
630         zonelist->zones[j++] = NULL;
631     } 
632 }
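Once built, a zonelist is consumed by walking it in order and taking the first zone that can satisfy the request, roughly as __alloc_pages() does. A minimal sketch of that consumer side (pick_zone() is a hypothetical helper and the zonelist is reduced to a NULL-terminated array):

#include <stdio.h>
#include <stddef.h>

struct zone { const char *name; unsigned long free_pages; };

/* Walk a NULL-terminated fallback list, returning the first zone
 * with enough free pages to satisfy the request */
static struct zone *pick_zone(struct zone **zonelist, unsigned long want)
{
    struct zone **z;

    for (z = zonelist; *z != NULL; z++)
        if ((*z)->free_pages >= want)
            return *z;
    return NULL;    /* every fallback zone was exhausted */
}

int main(void)
{
    struct zone highmem = { "HighMem", 0 };     /* exhausted */
    struct zone normal  = { "Normal", 1024 };
    struct zone dma     = { "DMA", 4096 };

    /* Fallback order for a __GFP_HIGHMEM allocation */
    struct zone *zonelist[] = { &highmem, &normal, &dma, NULL };

    struct zone *z = pick_zone(zonelist, 16);
    printf("allocation satisfied from %s\n", z ? z->name : "nowhere");
    return 0;
}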
B.2 Page Operations
  B.2.1 Locking Pages
    B.2.1.1 Function: lock_page()
    B.2.1.2 Function: __lock_page()
    B.2.1.3 Function: sync_page()
  B.2.2 Unlocking Pages
    B.2.2.1 Function: unlock_page()
  B.2.3 Waiting on Pages
    B.2.3.1 Function: wait_on_page()
    B.2.3.2 Function: ___wait_on_page()
This function tries to lock a page. If the page cannot be locked, it will cause the process to sleep until the page is available.
921 void lock_page(struct page *page)
922 {
923     if (TryLockPage(page))
924         __lock_page(page);
925 }
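TryLockPage() is an atomic test-and-set of the PG_locked bit, returning non-zero when the bit was already set, i.e. when someone else holds the lock. A userspace analogue using C11 atomics, with atomic_flag standing in for the PG_locked bit:

#include <stdatomic.h>
#include <stdio.h>

struct page { atomic_flag locked; };

/* Returns non-zero if the page was ALREADY locked, mirroring
 * TryLockPage()'s test_and_set_bit() semantics */
static int try_lock_page(struct page *page)
{
    return atomic_flag_test_and_set(&page->locked);
}

static void unlock_page(struct page *page)
{
    atomic_flag_clear(&page->locked);
    /* the real unlock_page() would also wake any sleepers here */
}

int main(void)
{
    struct page page = { ATOMIC_FLAG_INIT };

    printf("first try: %s\n", try_lock_page(&page) ? "busy" : "locked it");
    printf("second try: %s\n", try_lock_page(&page) ? "busy" : "locked it");
    unlock_page(&page);
    printf("after unlock: %s\n", try_lock_page(&page) ? "busy" : "locked it");
    return 0;
}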
This is called after TryLockPage() fails. It will locate the waitqueue for the page and sleep on it until the lock can be acquired.
897 static void __lock_page(struct page *page)
898 {
899     wait_queue_head_t *waitqueue = page_waitqueue(page);
900     struct task_struct *tsk = current;
901     DECLARE_WAITQUEUE(wait, tsk);
902 
903     add_wait_queue_exclusive(waitqueue, &wait);
904     for (;;) {
905         set_task_state(tsk, TASK_UNINTERRUPTIBLE);
906         if (PageLocked(page)) {
907             sync_page(page);
908             schedule();
909         }
910         if (!TryLockPage(page))
911             break;
912     }
913     __set_task_state(tsk, TASK_RUNNING);
914     remove_wait_queue(waitqueue, &wait);
915 }
This calls the filesystem-specific sync_page() to synchronise the page with its backing storage.
140 static inline int sync_page(struct page *page)
141 {
142     struct address_space *mapping = page->mapping;
143 
144     if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
145         return mapping->a_ops->sync_page(page);
146     return 0;
147 }
This function unlocks a page and wakes up any processes that may be waiting on it.
874 void unlock_page(struct page *page)
875 {
876     wait_queue_head_t *waitqueue = page_waitqueue(page);
877     ClearPageLaunder(page);
878     smp_mb__before_clear_bit();
879     if (!test_and_clear_bit(PG_locked, &(page)->flags))
880         BUG();
881     smp_mb__after_clear_bit(); 
882 
883     /*
884      * Although the default semantics of wake_up() are
885      * to wake all, here the specific function is used
886      * to make it even more explicit that a number of
887      * pages are being waited on here.
888      */
889     if (waitqueue_active(waitqueue))
890         wake_up_all(waitqueue);
891 }
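The unlock path pairs with __lock_page()'s sleep loop: clear the bit, wake every sleeper hashed to the queue and let them race to take the lock again. As a loose analogue only (the kernel uses hashed waitqueues and schedule(), not pthreads), the protocol can be sketched with a mutex and condition variable:

#include <pthread.h>
#include <stdio.h>

/* Condvar-based analogue of the PG_locked protocol. This mapping
 * is hypothetical: it stands in for the kernel's waitqueues. */
struct page {
    int locked;
    pthread_mutex_t mutex;
    pthread_cond_t waitqueue;
};

/* Mirrors __lock_page(): sleep until the lock can be taken */
static void lock_page(struct page *page)
{
    pthread_mutex_lock(&page->mutex);
    while (page->locked)
        pthread_cond_wait(&page->waitqueue, &page->mutex);
    page->locked = 1;
    pthread_mutex_unlock(&page->mutex);
}

/* Mirrors unlock_page(): clear the bit, then wake all sleepers */
static void unlock_page(struct page *page)
{
    pthread_mutex_lock(&page->mutex);
    page->locked = 0;
    pthread_cond_broadcast(&page->waitqueue); /* wake_up_all() analogue */
    pthread_mutex_unlock(&page->mutex);
}

static void *worker(void *arg)
{
    struct page *page = arg;

    lock_page(page);          /* sleeps until main() unlocks */
    printf("worker got the page lock\n");
    unlock_page(page);
    return NULL;
}

int main(void)
{
    struct page page = { 0, PTHREAD_MUTEX_INITIALIZER,
                         PTHREAD_COND_INITIALIZER };
    pthread_t t;

    lock_page(&page);
    pthread_create(&t, NULL, worker, &page);
    unlock_page(&page);       /* lets the worker proceed */
    pthread_join(t, NULL);
    return 0;
}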
Source: include/linux/pagemap.h
This function checks whether the page is locked and, if it is, calls ___wait_on_page() to sleep until the page is unlocked.
 94 static inline void wait_on_page(struct page * page)
 95 {
 96     if (PageLocked(page))
 97         ___wait_on_page(page);
 98 }
This function is called after PageLocked() has been used to determine the page is locked. The calling process will probably sleep until the page is unlocked.
849 void ___wait_on_page(struct page *page)
850 {
851     wait_queue_head_t *waitqueue = page_waitqueue(page);
852     struct task_struct *tsk = current;
853     DECLARE_WAITQUEUE(wait, tsk);
854 
855     add_wait_queue(waitqueue, &wait);
856     do {
857         set_task_state(tsk, TASK_UNINTERRUPTIBLE);
858         if (!PageLocked(page))
859             break;
860         sync_page(page);
861         schedule();
862     } while (PageLocked(page));
863     __set_task_state(tsk, TASK_RUNNING);
864     remove_wait_queue(waitqueue, &wait);
865 }