Isaku Yamahata
2009-Apr-28  03:01 UTC
[Xen-ia64-devel] [PATCH 4/4] linux/blktap: fix blktap struct page* array.
linux/blktap: fix vma_close() for partial munmap.
vm_area_struct::vm_private_data is used
by get_user_pages() so that we can''t override
it. So in order to make blktap work, set it
to a array of struct page*.
Without mm->mmap_sem, virtual mapping can be changed.
so remembering vma which was passed to mmap callback
is bogus because later the vma can be freed or changed.
So don''t remember vma and put necessary infomations into
tap_blkif_t. and use find_vma() to get necessary vma''s.
Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
diff --git a/drivers/xen/blktap/blktap.c b/drivers/xen/blktap/blktap.c
--- a/drivers/xen/blktap/blktap.c
+++ b/drivers/xen/blktap/blktap.c
@@ -99,7 +99,7 @@ typedef struct domid_translate_ext {
 
 /*Data struct associated with each of the tapdisk devices*/
 typedef struct tap_blkif {
-	struct vm_area_struct *vma;   /*Shared memory area                   */
+	struct mm_struct *mm;         /*User address space                   */
 	unsigned long rings_vstart;   /*Kernel memory mapping                */
 	unsigned long user_vstart;    /*User memory mapping                  */
 	unsigned long dev_inuse;      /*One process opens device at a time.  */
@@ -116,6 +116,7 @@ typedef struct tap_blkif {
 					[req id, idx] tuple                  */
 	blkif_t *blkif;               /*Associate blkif with tapdev          */
 	struct domid_translate_ext trans; /*Translation from domid to bus.   */
+	struct page **map;	      /*Mapping page */
 } tap_blkif_t;
 
 static struct tap_blkif *tapfds[MAX_TAP_DEV];
@@ -293,10 +294,6 @@ static inline int OFFSET_TO_SEG(int offs
 /******************************************************************
  * BLKTAP VM OPS
  */
-struct tap_vma_priv {
-	tap_blkif_t *info;
-	struct page *map[];
-};
 
 static struct page *blktap_nopage(struct vm_area_struct *vma,
 				  unsigned long address,
@@ -315,11 +312,10 @@ static pte_t blktap_clear_pte(struct vm_
 			      pte_t *ptep, int is_fullmm)
 {
 	pte_t copy;
-	tap_blkif_t *info;
+	tap_blkif_t *info = NULL;
 	int offset, seg, usr_idx, pending_idx, mmap_idx;
 	unsigned long uvstart;
 	unsigned long kvaddr;
-	struct tap_vma_priv *priv;
 	struct page *pg;
 	struct grant_handle_pair *khandle;
 	struct gnttab_unmap_grant_ref unmap[2];
@@ -338,12 +334,9 @@ static pte_t blktap_clear_pte(struct vm_
 		return ptep_get_and_clear_full(vma->vm_mm, uvaddr, 
 					       ptep, is_fullmm);
 
-	priv = vma->vm_private_data;
-
 	/* TODO Should these be changed to if statements? */
 	BUG_ON(!info);
 	BUG_ON(!info->idx_map);
-	BUG_ON(!priv);
 
 	offset = (int) ((uvaddr - uvstart) >> PAGE_SHIFT);
 	usr_idx = OFFSET_TO_USR_IDX(offset);
@@ -355,7 +348,7 @@ static pte_t blktap_clear_pte(struct vm_
 	kvaddr = idx_to_kaddr(mmap_idx, pending_idx, seg);
 	pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
 	ClearPageReserved(pg);
-	priv->map[offset + RING_PAGES] = NULL;
+	info->map[offset + RING_PAGES] = NULL;
 
 	khandle = &pending_handle(mmap_idx, pending_idx, seg);
 
@@ -396,19 +389,43 @@ static pte_t blktap_clear_pte(struct vm_
 	return copy;
 }
 
+static void blktap_vma_open(struct vm_area_struct *vma)
+{
+	tap_blkif_t *info;
+	if (vma->vm_file == NULL)
+		return;
+
+	info = vma->vm_file->private_data;
+	vma->vm_private_data +		&info->map[(vma->vm_start -
info->rings_vstart) >> PAGE_SHIFT];
+}
+
+/* tricky part
+ * When partial munmapping, ->open() is called only splitted vma which
+ * will be released soon. * See split_vma() and do_munmap() in mm/mmap.c
+ * So there is no chance to fix up vm_private_data of the end vma.
+ */
 static void blktap_vma_close(struct vm_area_struct *vma)
 {
-	struct tap_vma_priv *priv = vma->vm_private_data;
+	tap_blkif_t *info;
+	struct vm_area_struct *next = vma->vm_next;
 
-	if (priv) {
-		priv->info->vma = NULL;
-		kfree(priv);
-	}
+	if (next == NULL ||
+	    vma->vm_ops != next->vm_ops ||
+	    vma->vm_end != next->vm_start ||
+	    vma->vm_file == NULL ||
+	    vma->vm_file != next->vm_file)
+		return;
+
+	info = vma->vm_file->private_data;
+	next->vm_private_data +		&info->map[(next->vm_start -
info->rings_vstart) >> PAGE_SHIFT];
 }
 
-struct vm_operations_struct blktap_vm_ops = {
+static struct vm_operations_struct blktap_vm_ops = {
 	nopage:   blktap_nopage,
 	zap_pte:  blktap_clear_pte,
+	open:     blktap_vma_open,
 	close:    blktap_vma_close,
 };
 
@@ -455,7 +472,7 @@ static tap_blkif_t *get_next_free_dev(vo
 		info = tapfds[minor];
 		/* we could have failed a previous attempt. */
 		if (!info ||
-		    ((info->dev_inuse == 0) &&
+		    ((!test_bit(0, &info->dev_inuse)) &&
 		     (info->dev_pending == 0)) ) {
 			info->dev_pending = 1;
 			goto found;
@@ -592,7 +609,7 @@ static int blktap_open(struct inode *ino
 	FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE);
 	
 	filp->private_data = info;
-	info->vma = NULL;
+	info->mm = NULL;
 
 	info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS, 
 				GFP_KERNEL);
@@ -624,8 +641,10 @@ static int blktap_release(struct inode *
 	info->ring_ok = 0;
 	smp_wmb();
 
-	info->dev_inuse = 0;
-	DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor);
+	mmput(info->mm);
+	info->mm = NULL;
+	kfree(info->map);
+	info->map = NULL;
 
 	/* Free the ring page. */
 	ClearPageReserved(virt_to_page(info->ufe_ring.sring));
@@ -644,6 +663,9 @@ static int blktap_release(struct inode *
 		info->status = CLEANSHUTDOWN;
 	}
 
+	clear_bit(0, &info->dev_inuse);
+	DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor);
+
 	return 0;
 }
 
@@ -669,7 +691,6 @@ static int blktap_release(struct inode *
 static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	int size;
-	struct tap_vma_priv *priv;
 	tap_blkif_t *info = filp->private_data;
 	int ret;
 
@@ -706,16 +727,14 @@ static int blktap_mmap(struct file *filp
 	}
 
 	/* Mark this VM as containing foreign pages, and set up mappings. */
-	priv = kzalloc(sizeof(*priv) + ((vma->vm_end - vma->vm_start)
-					>> PAGE_SHIFT) * sizeof(*priv->map),
-		       GFP_KERNEL);
-	if (priv == NULL) {
+	info->map = kzalloc(((vma->vm_end - vma->vm_start) >>
PAGE_SHIFT) *
+			    sizeof(*info->map), GFP_KERNEL);
+	if (info->map == NULL) {
 		WPRINTK("Couldn''t alloc VM_FOREIGN map.\n");
 		goto fail;
 	}
-	priv->info = info;
 
-	vma->vm_private_data = priv;
+	vma->vm_private_data = info->map;
 	vma->vm_flags |= VM_FOREIGN;
 	vma->vm_flags |= VM_DONTCOPY;
 
@@ -723,7 +742,7 @@ static int blktap_mmap(struct file *filp
 	vma->vm_mm->context.has_foreign_mappings = 1;
 #endif
 
-	info->vma = vma;
+	info->mm = get_task_mm(current);
 	smp_wmb();
 	info->ring_ok = 1;
 	return 0;
@@ -997,6 +1016,24 @@ static void free_req(pending_req_t *req)
 		wake_up(&pending_free_wq);
 }
 
+static void blktap_zap_page_range(struct mm_struct *mm,
+				  unsigned long uvaddr, int nr_pages)
+{
+	unsigned long end = uvaddr + (nr_pages << PAGE_SHIFT);
+	struct vm_area_struct *vma;
+
+	vma = find_vma(mm, uvaddr);
+	while (vma && uvaddr < end) {
+		unsigned long s = max(uvaddr, vma->vm_start);
+		unsigned long e = min(end, vma->vm_end);
+
+		zap_page_range(vma, s, e - s, NULL);
+
+		uvaddr = e;
+		vma = vma->vm_next;
+	}
+}
+
 static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx,
 			    int tapidx)
 {
@@ -1017,14 +1054,13 @@ static void fast_flush_area(pending_req_
 		return;
 	}
 
-	mm = info->vma ? info->vma->vm_mm : NULL;
+	mm = info->mm;
 
-	if (info->vma != NULL &&
-	    xen_feature(XENFEAT_auto_translated_physmap)) {
+	if (mm != NULL && xen_feature(XENFEAT_auto_translated_physmap)) {
 		down_write(&mm->mmap_sem);
-		zap_page_range(info->vma, 
-			       MMAP_VADDR(info->user_vstart, u_idx, 0), 
-			       req->nr_pages << PAGE_SHIFT, NULL);
+		blktap_zap_page_range(mm,
+				      MMAP_VADDR(info->user_vstart, u_idx, 0),
+				      req->nr_pages);
 		up_write(&mm->mmap_sem);
 		return;
 	}
@@ -1075,13 +1111,12 @@ static void fast_flush_area(pending_req_
 		GNTTABOP_unmap_grant_ref, unmap, invcount);
 	BUG_ON(ret);
 	
-	if (info->vma != NULL &&
-	    !xen_feature(XENFEAT_auto_translated_physmap)) {
+	if (mm != NULL && !xen_feature(XENFEAT_auto_translated_physmap)) {
 		if (!locked++)
 			down_write(&mm->mmap_sem);
-		zap_page_range(info->vma, 
-			       MMAP_VADDR(info->user_vstart, u_idx, 0), 
-			       req->nr_pages << PAGE_SHIFT, NULL);
+		blktap_zap_page_range(mm, 
+				      MMAP_VADDR(info->user_vstart, u_idx, 0), 
+				      req->nr_pages);
 	}
 
 	if (locked)
@@ -1195,7 +1230,6 @@ static int blktap_read_ufe_ring(tap_blki
 		for (j = 0; j < pending_req->nr_pages; j++) {
 
 			unsigned long kvaddr, uvaddr;
-			struct tap_vma_priv *priv = info->vma->vm_private_data;
 			struct page *pg;
 			int offset;
 
@@ -1205,7 +1239,7 @@ static int blktap_read_ufe_ring(tap_blki
 			pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
 			ClearPageReserved(pg);
 			offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
-			priv->map[offset] = NULL;
+			info->map[offset] = NULL;
 		}
 		fast_flush_area(pending_req, pending_idx, usr_idx, info->minor);
 		info->idx_map[usr_idx] = INVALID_REQ;
@@ -1267,7 +1301,8 @@ static int do_block_io_op(blkif_t *blkif
 
 	info = tapfds[blkif->dev_num];
 
-	if (blkif->dev_num > MAX_TAP_DEV || !info || !info->dev_inuse) {
+	if (blkif->dev_num > MAX_TAP_DEV || !info ||
+	    !test_bit(0, &info->dev_inuse)) {
 		if (print_dbug) {
 			WPRINTK("Can''t get UE info!\n");
 			print_dbug = 0;
@@ -1363,12 +1398,12 @@ static void dispatch_rw_block_io(blkif_t
 	unsigned int nseg;
 	int ret, i, nr_sects = 0;
 	tap_blkif_t *info;
-	struct tap_vma_priv *priv;
 	blkif_request_t *target;
 	int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx);
 	int usr_idx;
 	uint16_t mmap_idx = pending_req->mem_idx;
 	struct mm_struct *mm;
+	struct vm_area_struct *vma = NULL;
 
 	if (blkif->dev_num < 0 || blkif->dev_num > MAX_TAP_DEV)
 		goto fail_response;
@@ -1413,8 +1448,7 @@ static void dispatch_rw_block_io(blkif_t
 	pending_req->status    = BLKIF_RSP_OKAY;
 	pending_req->nr_pages  = nseg;
 	op = 0;
-	priv = info->vma->vm_private_data;
-	mm = info->vma->vm_mm;
+	mm = info->mm;
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
 		down_write(&mm->mmap_sem);
 	for (i = 0; i < nseg; i++) {
@@ -1497,7 +1531,7 @@ static void dispatch_rw_block_io(blkif_t
 							  >> PAGE_SHIFT));
 			offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
 			pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
-			priv->map[offset] = pg;
+			info->map[offset] = pg;
 		}
 	} else {
 		for (i = 0; i < nseg; i++) {
@@ -1524,7 +1558,7 @@ static void dispatch_rw_block_io(blkif_t
 
 			offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
 			pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
-			priv->map[offset] = pg;
+			info->map[offset] = pg;
 		}
 	}
 
@@ -1542,9 +1576,23 @@ static void dispatch_rw_block_io(blkif_t
 		pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
 		SetPageReserved(pg);
 		if (xen_feature(XENFEAT_auto_translated_physmap)) {
-			ret = vm_insert_page(info->vma,
-					     MMAP_VADDR(info->user_vstart,
-							usr_idx, i), pg);
+			unsigned long uvaddr = MMAP_VADDR(info->user_vstart,
+							  usr_idx, i);
+			if (vma && uvaddr >= vma->vm_end) {
+				vma = vma->vm_next;
+				if (vma &&
+				    (uvaddr < vma->vm_start ||
+				     uvaddr >= vma->vm_end))
+					vma = NULL;
+			}
+			if (vma == NULL) {
+				vma = find_vma(mm, uvaddr);
+				/* this virtual area was already munmapped.
+				   so skip to next page */
+				if (!vma)
+					continue;
+			}
+			ret = vm_insert_page(vma, uvaddr, pg);
 			if (ret) {
 				up_write(&mm->mmap_sem);
 				goto fail_flush;
_______________________________________________
Xen-ia64-devel mailing list
Xen-ia64-devel@lists.xensource.com
http://lists.xensource.com/xen-ia64-devel