diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index b7fa2f599459b67cb27586bea9869d83a033986f..c848095f2cb0fc7d8505389855ee9e26a88889a4 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3023,6 +3023,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
 	S		[KNL] Run init in single mode
 
+	s390_iommu=	[HW,S390]
+			Set s390 IOTLB flushing mode
+		strict
+			With strict flushing every unmap operation will result in
+			an IOTLB flush. Default is lazy flushing before reuse,
+			which is faster.
+
 	sa1100ir	[NET]
 			See drivers/net/irda/sa1100_ir.c.
 
diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index f91c03119804751894c39433eb52da1844dbcf3b..4cbb29a4d6152238c771d5f1936b9d9d15b94e68 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -16,6 +16,13 @@
 
 static struct kmem_cache *dma_region_table_cache;
 static struct kmem_cache *dma_page_table_cache;
+static int s390_iommu_strict;
+
+static int zpci_refresh_global(struct zpci_dev *zdev)
+{
+	return zpci_refresh_trans((u64) zdev->fh << 32, zdev->start_dma,
+				  zdev->iommu_pages * PAGE_SIZE);
+}
 
 static unsigned long *dma_alloc_cpu_table(void)
 {
@@ -155,18 +162,15 @@ static int dma_update_trans(struct zpci_dev *zdev, unsigned long pa,
 	}
 
 	/*
-	 * rpcit is not required to establish new translations when previously
-	 * invalid translation-table entries are validated, however it is
-	 * required when altering previously valid entries.
+	 * With zdev->tlb_refresh == 0, rpcit is not required to establish new
+	 * translations when previously invalid translation-table entries are
+	 * validated. With lazy unmap, it also is skipped for previously valid
+	 * entries, but a global rpcit is then required before any address can
+	 * be re-used, i.e. after each iommu bitmap wrap-around.
 	 */
 	if (!zdev->tlb_refresh &&
-	    ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID))
-		/*
-		 * TODO: also need to check that the old entry is indeed INVALID
-		 * and not only for one page but for the whole range...
-		 * -> now we WARN_ON in that case but with lazy unmap that
-		 * needs to be redone!
-		 */
+			(!s390_iommu_strict ||
+			((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID)))
 		goto no_refresh;
 
 	rc = zpci_refresh_trans((u64) zdev->fh << 32, start_dma_addr,
@@ -220,16 +224,21 @@ static unsigned long __dma_alloc_iommu(struct zpci_dev *zdev,
 static unsigned long dma_alloc_iommu(struct zpci_dev *zdev, int size)
 {
 	unsigned long offset, flags;
+	int wrap = 0;
 
 	spin_lock_irqsave(&zdev->iommu_bitmap_lock, flags);
 	offset = __dma_alloc_iommu(zdev, zdev->next_bit, size);
-	if (offset == -1)
+	if (offset == -1) {
+		/* wrap-around */
 		offset = __dma_alloc_iommu(zdev, 0, size);
+		wrap = 1;
+	}
 
 	if (offset != -1) {
 		zdev->next_bit = offset + size;
-		if (zdev->next_bit >= zdev->iommu_pages)
-			zdev->next_bit = 0;
+		if (!zdev->tlb_refresh && !s390_iommu_strict && wrap)
+			/* global flush after wrap-around with lazy unmap */
+			zpci_refresh_global(zdev);
 	}
 	spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags);
 	return offset;
@@ -243,7 +252,11 @@ static void dma_free_iommu(struct zpci_dev *zdev, unsigned long offset, int size
 	if (!zdev->iommu_bitmap)
 		goto out;
 	bitmap_clear(zdev->iommu_bitmap, offset, size);
-	if (offset >= zdev->next_bit)
+	/*
+	 * Lazy flush for unmap: need to move next_bit to avoid address re-use
+	 * until wrap-around.
+	 */
+	if (!s390_iommu_strict && offset >= zdev->next_bit)
 		zdev->next_bit = offset + size;
 out:
 	spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags);
@@ -504,3 +517,12 @@ struct dma_map_ops s390_dma_ops = {
 	/* dma_supported is unconditionally true without a callback */
 };
 EXPORT_SYMBOL_GPL(s390_dma_ops);
+
+static int __init s390_iommu_setup(char *str)
+{
+	if (!strncmp(str, "strict", 6))
+		s390_iommu_strict = 1;
+	return 0;
+}
+
+__setup("s390_iommu=", s390_iommu_setup);