 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/pagemap.h>
+#include <linux/pagevec.h>
 #include <linux/mempolicy.h>
 #include <linux/syscalls.h>
 #include <linux/sched.h>
 #include <linux/export.h>
 #include <linux/rmap.h>
 #include <linux/mmzone.h>
 #include <linux/hugetlb.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
 
 #include "internal.h"
 
@@ -87,6 +90,47 @@ void mlock_vma_page(struct page *page)
 	}
 }
 
+/*
+ * Finish munlock after successful page isolation
+ *
+ * Page must be locked. This is a wrapper for try_to_munlock()
+ * and putback_lru_page() with munlock accounting.
+ */
+static void __munlock_isolated_page(struct page *page)
+{
+	int ret = SWAP_AGAIN;
+
+	/*
+	 * Optimization: if the page was mapped just once, that's our mapping
+	 * and we don't need to check all the other vmas.
+	 */
+	if (page_mapcount(page) > 1)
+		ret = try_to_munlock(page);
+
+	/* Did try_to_munlock() succeed or punt? */
+	if (ret != SWAP_MLOCK)
+		count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+
+	putback_lru_page(page);
+}
+
+/*
+ * Accounting for page isolation failure during munlock
+ *
+ * Performs accounting when page isolation fails in munlock. There is nothing
+ * else to do because it means some other task has already removed the page
+ * from the LRU. putback_lru_page() will take care of removing the page from
+ * the unevictable list, if necessary. vmscan [page_referenced()] will move
+ * the page back to the unevictable list if some other vma has it mlocked.
+ */
+static void __munlock_isolation_failed(struct page *page)
+{
+	if (PageUnevictable(page))
+		count_vm_event(UNEVICTABLE_PGSTRANDED);
+	else
+		count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+}
+
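+/*
+ * The two helpers above factor out the paths that munlock_vma_page()
+ * and the batched __munlock_pagevec() below have in common.
+ */
+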
 /**
  * munlock_vma_page - munlock a vma page
  * @page - page to be unlocked
@@ -112,37 +156,10 @@ unsigned int munlock_vma_page(struct page *page)
 		unsigned int nr_pages = hpage_nr_pages(page);
 		mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
 		page_mask = nr_pages - 1;
-		if (!isolate_lru_page(page)) {
-			int ret = SWAP_AGAIN;
-
-			/*
-			 * Optimization: if the page was mapped just once,
-			 * that's our mapping and we don't need to check all the
-			 * other vmas.
-			 */
-			if (page_mapcount(page) > 1)
-				ret = try_to_munlock(page);
-			/*
-			 * did try_to_unlock() succeed or punt?
-			 */
-			if (ret != SWAP_MLOCK)
-				count_vm_event(UNEVICTABLE_PGMUNLOCKED);
-
-			putback_lru_page(page);
-		} else {
-			/*
-			 * Some other task has removed the page from the LRU.
-			 * putback_lru_page() will take care of removing the
-			 * page from the unevictable list, if necessary.
-			 * vmscan [page_referenced()] will move the page back
-			 * to the unevictable list if some other vma has it
-			 * mlocked.
-			 */
-			if (PageUnevictable(page))
-				count_vm_event(UNEVICTABLE_PGSTRANDED);
-			else
-				count_vm_event(UNEVICTABLE_PGMUNLOCKED);
-		}
+		if (!isolate_lru_page(page))
+			__munlock_isolated_page(page);
+		else
+			__munlock_isolation_failed(page);
 	}
 
 	return page_mask;
@@ -209,6 +226,73 @@ static int __mlock_posix_error_return(long retval)
 	return retval;
 }
 
+/*
+ * Munlock a batch of pages from the same zone
+ *
+ * The work is split into two main phases. The first phase clears the Mlocked
+ * flag and attempts to isolate the pages, all under a single zone lru lock.
+ * The second phase finishes the munlock only for pages where isolation
+ * succeeded.
+ *
+ * Note that pvec is modified during the process. Before returning,
+ * pagevec_reinit() is called on it.
+ */
+static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
+{
+	int i;
+	int nr = pagevec_count(pvec);
+
+	/* Phase 1: page isolation */
+	spin_lock_irq(&zone->lru_lock);
+	for (i = 0; i < nr; i++) {
+		struct page *page = pvec->pages[i];
+
+		if (TestClearPageMlocked(page)) {
+			struct lruvec *lruvec;
+			int lru;
+
+			/* we have disabled interrupts */
+			__mod_zone_page_state(zone, NR_MLOCK, -1);
+
+			if (PageLRU(page)) {
+				lruvec = mem_cgroup_page_lruvec(page, zone);
+				lru = page_lru(page);
+
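+				/*
+				 * Open-coded equivalent of
+				 * isolate_lru_page(): zone->lru_lock is
+				 * already held for the whole batch, so
+				 * the page can be taken off the LRU
+				 * directly.
+				 */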
+				get_page(page);
+				ClearPageLRU(page);
+				del_page_from_lru_list(page, lruvec, lru);
+			} else {
+				__munlock_isolation_failed(page);
+				goto skip_munlock;
+			}
+
+		} else {
+skip_munlock:
+			/*
+			 * We won't be munlocking this page in the next phase
+			 * but we still need to release the follow_page_mask()
+			 * pin.
+			 */
+			pvec->pages[i] = NULL;
+			put_page(page);
+		}
+	}
+	spin_unlock_irq(&zone->lru_lock);
+
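+	/*
+	 * The second phase must run with the lru_lock dropped: lock_page()
+	 * below can sleep, which is not allowed while holding a spinlock
+	 * with interrupts disabled.
+	 */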
+	/* Phase 2: page munlock and putback */
+	for (i = 0; i < nr; i++) {
+		struct page *page = pvec->pages[i];
+
+		if (page) {
+			lock_page(page);
+			__munlock_isolated_page(page);
+			unlock_page(page);
+			put_page(page); /* pin from follow_page_mask() */
+		}
+	}
+	pagevec_reinit(pvec);
+}
+
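+/*
+ * Sketch of the intended calling pattern for __munlock_pagevec()
+ * (illustrative; munlock_vma_pages_range() below is the real caller):
+ *
+ *	pagevec_init(&pvec, 0);
+ *	for each candidate page in the zone:
+ *		if (pagevec_add(&pvec, page) == 0)
+ *			__munlock_pagevec(&pvec, zone);
+ *	if (pagevec_count(&pvec))
+ *		__munlock_pagevec(&pvec, zone);
+ *
+ * pagevec_add() returns the number of free slots remaining, so a zero
+ * return means the pagevec just became full and is drained immediately;
+ * the final drain handles a partially filled pagevec.
+ */
+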
 /*
  * munlock_vma_pages_range() - munlock all pages in the vma range.
  * @vma - vma containing range to be munlock()ed.
@@ -230,11 +314,16 @@ static int __mlock_posix_error_return(long retval)
 void munlock_vma_pages_range(struct vm_area_struct *vma,
 			     unsigned long start, unsigned long end)
 {
+	struct pagevec pvec;
+	struct zone *zone = NULL;
+
+	pagevec_init(&pvec, 0);
 	vma->vm_flags &= ~VM_LOCKED;
 
 	while (start < end) {
 		struct page *page;
 		unsigned int page_mask, page_increm;
+		struct zone *pagezone;
 
 		/*
 		 * Although FOLL_DUMP is intended for get_dump_page(),
@@ -246,20 +335,47 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
 		page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
 					&page_mask);
 		if (page && !IS_ERR(page)) {
-			lock_page(page);
-			/*
-			 * Any THP page found by follow_page_mask() may have
-			 * gotten split before reaching munlock_vma_page(),
-			 * so we need to recompute the page_mask here.
-			 */
-			page_mask = munlock_vma_page(page);
-			unlock_page(page);
-			put_page(page);
+			pagezone = page_zone(page);
+			/* The whole pagevec must be in the same zone */
+			if (pagezone != zone) {
+				if (pagevec_count(&pvec))
+					__munlock_pagevec(&pvec, zone);
+				zone = pagezone;
+			}
+			if (PageTransHuge(page)) {
+				/*
+				 * THP pages are not handled by pagevec due
+				 * to their possible split (see below).
+				 */
+				if (pagevec_count(&pvec))
+					__munlock_pagevec(&pvec, zone);
+				lock_page(page);
+				/*
+				 * Any THP page found by follow_page_mask() may
+				 * have gotten split before reaching
+				 * munlock_vma_page(), so we need to recompute
+				 * the page_mask here.
+				 */
+				page_mask = munlock_vma_page(page);
+				unlock_page(page);
+				put_page(page); /* follow_page_mask() */
+			} else {
+				/*
+				 * Non-huge pages are handled in batches via
+				 * pagevec. The pin from follow_page_mask()
+				 * prevents them from being collapsed by THP.
+				 */
+				if (pagevec_add(&pvec, page) == 0)
+					__munlock_pagevec(&pvec, zone);
+			}
 		}
 		page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
 		start += page_increm * PAGE_SIZE;
 		cond_resched();
 	}
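+	/* Drain any pages left over in a partially filled pagevec */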
+	if (pagevec_count(&pvec))
+		__munlock_pagevec(&pvec, zone);
 }
 
 /*