BoldSign®: Effortlessly integrate e-signatures into your app with the BoldSign® API. Create a sandbox account!
/// <summary>
/// Decides whether a loaded PDF page is blank by running OCR over every image
/// extracted from it.
/// </summary>
/// <remarks>
/// Only the images returned by <c>ExtractImages</c> are inspected; this method does
/// not look at any other page content. A page with no images is reported as blank.
/// </remarks>
/// <param name="lpage">The loaded page to inspect.</param>
/// <returns>
/// <c>true</c> when the page has no images, or when OCR finds no text in any of
/// them; otherwise <c>false</c>.
/// </returns>
private bool IsBlankPage(PdfLoadedPage lpage)
{
    //Extract images
    Image[] images = lpage.ExtractImages();
    if (images == null)
    {
        //Treat a missing array the same as "no images on the page".
        return true;
    }

    foreach (Image img in images)
    {
        Bitmap bitmap = img as Bitmap;
        if (bitmap == null)
        {
            //Not a raster bitmap, so it cannot be OCR-ed here. Be conservative:
            //a page carrying content we cannot analyse is not reported as blank.
            return false;
        }

        //PerformOCR returns true when NO text was recognised in the image.
        if (!PerformOCR(bitmap))
        {
            return false;
        }
    }

    //NOTE(review): the extracted images are not disposed here - confirm whether
    //the caller or the PDF library owns them.
    return true;
}
/// <summary>
/// Runs English-language OCR on the given bitmap and reports whether the
/// recognised text is empty.
/// </summary>
/// <param name="img">The bitmap to run OCR on.</param>
/// <returns>
/// <c>true</c> when OCR produced no text (null, empty, or whitespace only);
/// <c>false</c> when some text was recognised. Note that <c>true</c> means
/// "empty", not "OCR succeeded".
/// </returns>
private bool PerformOCR(Bitmap img)
{
    //Create a new OCR processor
    using (OCRProcessor processor = new OCRProcessor(tesseractBinariesPath))
    {
        //Set language.
        processor.Settings.Language = Languages.English;

        //perform OCR
        string text = processor.PerformOCR(img, tessdataPath);

        //Output made only of spaces or line breaks carries no text, so it is
        //treated the same as an empty result.
        return string.IsNullOrWhiteSpace(text);
    }
}
Hi.
Thanks for the answer.
A question about your answer:
If the image doesn't include text (e.g. a picture), PerformOCR will return an empty value and we will remove a good page (not only the empty ones).
/// <summary>
/// Decides whether a bitmap is "empty" by scanning its pixels with
/// <c>GetPixel</c>.
/// </summary>
/// <remarks>
/// Pure white pixels (R, G and B all 255) are ignored. Pure black pixels
/// (R, G and B all 0) are counted. Any other pixel makes the image non-empty
/// immediately. The alpha channel is not considered.
/// </remarks>
/// <param name="image">The bitmap to inspect.</param>
/// <returns>
/// <c>false</c> as soon as a coloured pixel is found or the number of black
/// pixels reaches 25% of all pixels; otherwise <c>true</c>.
/// </returns>
private bool IsEmptyImage(Bitmap image)
{
    //Read the dimensions once instead of on every loop iteration.
    int width = image.Width;
    int height = image.Height;

    //Suspect 25% of image have black pixels then it is not an empty image.
    //Computed in 64-bit so that very large images cannot overflow.
    long blackPixelRange = ((long)width * height) * 25 / 100;
    long blackPixelCount = 0;

    for (int i = 0; i < width; i++)
    {
        for (int j = 0; j < height; j++)
        {
            Color color = image.GetPixel(i, j);

            if (color.R == 255 && color.G == 255 && color.B == 255)
            {
                //Skip the white pixels
                continue;
            }

            if (color.R == 0 && color.G == 0 && color.B == 0)
            {
                //Get the black pixels count
                blackPixelCount++;

                //Only test the threshold after a black pixel was actually seen.
                //For tiny images the threshold rounds down to 0, and testing it
                //unconditionally would flag an all-white image as non-empty.
                if (blackPixelCount >= blackPixelRange)
                {
                    return false;
                }

                continue;
            }

            //Colored pixels
            return false;
        }
    }

    return true;
}
|
Hi. Thanks for your help.
Your solution works well, but it is very slow. It takes 1 minute to process a 42-page PDF file.
/// <summary>
/// Decides whether a bitmap is "empty" by reading its pixel data through
/// <c>LockBits</c> rather than per-pixel <c>GetPixel</c> calls.
/// </summary>
/// <remarks>
/// The bitmap is locked as 32bpp ARGB so that every pixel occupies exactly four
/// bytes whatever the source pixel format is. Pure white pixels are ignored,
/// pure black pixels are counted, and any other pixel makes the image non-empty.
/// The alpha byte is not considered.
/// </remarks>
/// <param name="image">The bitmap to inspect.</param>
/// <returns>
/// <c>false</c> as soon as a coloured pixel is found or the number of black
/// pixels reaches 25% of all pixels; otherwise <c>true</c>.
/// </returns>
private bool IsEmpty(Bitmap image)
{
    const int bytesPerPixel = 4;

    int width = image.Width;
    int height = image.Height;
    Rectangle bounds = new Rectangle(0, 0, width, height);

    //Suspect 25% of image have black pixels then it is not an empty image.
    //Counted in pixels (not bytes) and computed in 64-bit to avoid overflow.
    long blackPixelRange = ((long)width * height) * 25 / 100;
    long blackPixelsCount = 0;

    //One scan line of pixel data, without any stride padding.
    byte[] row = new byte[width * bytesPerPixel];

    //Read-only access is enough; requesting a fixed format lets GDI+ convert
    //indexed or 24bpp sources for us.
    BitmapData bmpData = image.LockBits(bounds, ImageLockMode.ReadOnly, PixelFormat.Format32bppArgb);
    try
    {
        long scan0 = bmpData.Scan0.ToInt64();
        for (int y = 0; y < height; y++)
        {
            //Address each row via the signed stride so that bottom-up bitmaps
            //(negative stride) are read correctly and padding is skipped.
            IntPtr rowStart = new IntPtr(scan0 + (long)y * bmpData.Stride);
            Marshal.Copy(rowStart, row, 0, row.Length);

            for (int offset = 0; offset < row.Length; offset += bytesPerPixel)
            {
                //The first three bytes are the colour channels; their order does
                //not matter because white/black tests are symmetric. The fourth
                //byte (alpha) is ignored.
                byte c0 = row[offset];
                byte c1 = row[offset + 1];
                byte c2 = row[offset + 2];

                if (c0 == 255 && c1 == 255 && c2 == 255)
                {
                    //Skip the white pixels
                    continue;
                }

                if (c0 == 0 && c1 == 0 && c2 == 0)
                {
                    //Get the black pixels count
                    blackPixelsCount++;

                    //Test the threshold only after seeing a black pixel, so a
                    //threshold of 0 (tiny image) cannot reject an all-white image.
                    if (blackPixelsCount >= blackPixelRange)
                    {
                        return false;
                    }

                    continue;
                }

                //Colored pixel
                return false;
            }
        }
    }
    finally
    {
        // Unlock the bits, even on early return or exception.
        image.UnlockBits(bmpData);
    }

    return true;
}
|
Hi,
If a page contains only a watermark, a shape, or a textbox, the above logic would consider it blank, since it does not detect any images or text. How do we handle such a scenario so that only truly blank pages are returned?
Attached are sample PDFs in which a page is wrongly detected as blank. Please run these through your code and let us know the best approach.
Regards,
Rohit
Hi Rohit Pitre,
We have modified the sample based on your requirements. Kindly try the attached sample on your end and let us know the result.
Please find the sample below:
Regards,
Santhiya.