Hi namojainashis...,
To convert PDF and Word files to XML, I am using the SautinSoft .NET libraries.
For more details, refer to the links below.
https://www.sautinsoft.com/products/pdf-focus/convert-pdf-to-xml-document-in-dotnet.php
https://www.sautinsoft.com/products/useoffice/examples/convert-docx-to-xml-csharp-vb-net.php
You can install them from the NuGet Package Manager Console.
Search for SautinSoft.PdfFocus and SautinSoft.UseOffice and install both packages in your project.
Then use the code below to convert the uploaded file to an XML file.
HTML
<!-- File picker for the PDF/Word document to convert; referenced from the code-behind as fuUpload. -->
<asp:FileUpload ID="fuUpload" runat="server" />
<!-- Posts back to the server and invokes the ConvertFile click handler. -->
<asp:Button Text="Convert To XML" runat="server" OnClick="ConvertFile" />
Namespaces
C#
using System.IO;
VB.Net
Imports System.IO
Code
C#
/// <summary>
/// Click handler for the "Convert To XML" button. Saves the uploaded PDF or Word
/// file under ~/Uploads and writes an XML version next to it (same name, .xml extension).
/// Uploads with any other extension are ignored and are not written to disk.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">Event data (not used).</param>
protected void ConvertFile(object sender, EventArgs e)
{
    if (!fuUpload.HasFile)
    {
        return;
    }

    // Keep only the file-name part so the saved path always stays inside ~/Uploads.
    string fileName = Path.GetFileName(fuUpload.FileName);
    string fileExtension = Path.GetExtension(fileName).Trim();

    // Ordinal, case-insensitive comparison: extensions are identifiers, not linguistic text.
    bool isPdf = string.Equals(fileExtension, ".pdf", StringComparison.OrdinalIgnoreCase);
    bool isWord = string.Equals(fileExtension, ".doc", StringComparison.OrdinalIgnoreCase)
        || string.Equals(fileExtension, ".docx", StringComparison.OrdinalIgnoreCase);

    // Validate before saving: do not persist arbitrary file types into a web-served folder.
    if (!isPdf && !isWord)
    {
        return;
    }

    string filePath = Server.MapPath("~/Uploads/" + fileName);
    string pathToXml = Path.ChangeExtension(filePath, ".xml");
    fuUpload.PostedFile.SaveAs(filePath);

    if (isPdf)
    {
        // Convert PDF file to XML file.
        SautinSoft.PdfFocus f = new SautinSoft.PdfFocus();
        // NOTE(review): the flag name suggests that 'true' also emits non-tabular (textual)
        // data; set it to false if only tables are wanted - confirm against the PdfFocus docs.
        f.XmlOptions.ConvertNonTabularDataToSpreadsheet = true;
        f.OpenPdf(filePath);
        try
        {
            if (f.PageCount > 0)
            {
                f.ToXml(pathToXml);
            }
        }
        finally
        {
            // Release the opened PDF even when the conversion throws.
            f.ClosePdf();
        }
    }
    else
    {
        SautinSoft.UseOffice u = new SautinSoft.UseOffice();
        // Prepare UseOffice .Net, loads MS Word in memory.
        // A non-zero return code means Word could not be loaded, so there is nothing to convert with.
        if (u.InitWord() != 0)
        {
            return;
        }

        try
        {
            // NOTE(review): DOCX_to_XML is used for both .doc and .docx, as in the original
            // sample; the return code is not reported to the user - consider surfacing failures.
            u.ConvertFile(filePath, pathToXml, SautinSoft.UseOffice.eDirection.DOCX_to_XML);
        }
        finally
        {
            // Release MS Word from memory even when the conversion throws.
            u.CloseWord();
        }
    }
}
VB.Net
''' <summary>
''' Click handler for the "Convert To XML" button. Saves the uploaded PDF or Word
''' file under ~/Uploads and writes an XML version next to it (same name, .xml extension).
''' Uploads with any other extension are ignored and are not written to disk.
''' </summary>
''' <param name="sender">The control that raised the event.</param>
''' <param name="e">Event data (not used).</param>
Protected Sub ConvertFile(ByVal sender As Object, ByVal e As EventArgs)
    If Not fuUpload.HasFile Then
        Return
    End If

    ' Keep only the file-name part so the saved path always stays inside ~/Uploads.
    Dim fileName As String = Path.GetFileName(fuUpload.FileName)
    Dim fileExtension As String = Path.GetExtension(fileName).Trim()

    ' Ordinal, case-insensitive comparison: extensions are identifiers, not linguistic text.
    Dim isPdf As Boolean = String.Equals(fileExtension, ".pdf", StringComparison.OrdinalIgnoreCase)
    Dim isWord As Boolean = String.Equals(fileExtension, ".doc", StringComparison.OrdinalIgnoreCase) OrElse
        String.Equals(fileExtension, ".docx", StringComparison.OrdinalIgnoreCase)

    ' Validate before saving: do not persist arbitrary file types into a web-served folder.
    If Not isPdf AndAlso Not isWord Then
        Return
    End If

    Dim filePath As String = Server.MapPath("~/Uploads/" & fileName)
    Dim pathToXml As String = Path.ChangeExtension(filePath, ".xml")
    fuUpload.PostedFile.SaveAs(filePath)

    If isPdf Then
        ' Convert PDF file to XML file.
        Dim f As SautinSoft.PdfFocus = New SautinSoft.PdfFocus()
        ' NOTE(review): the flag name suggests that True also emits non-tabular (textual)
        ' data; set it to False if only tables are wanted - confirm against the PdfFocus docs.
        f.XmlOptions.ConvertNonTabularDataToSpreadsheet = True
        f.OpenPdf(filePath)
        Try
            If f.PageCount > 0 Then
                f.ToXml(pathToXml)
            End If
        Finally
            ' Release the opened PDF even when the conversion throws.
            f.ClosePdf()
        End Try
    Else
        Dim u As SautinSoft.UseOffice = New SautinSoft.UseOffice()
        ' Prepare UseOffice .Net, loads MS Word in memory.
        ' A non-zero return code means Word could not be loaded, so there is nothing to convert with.
        If u.InitWord() <> 0 Then
            Return
        End If

        Try
            ' NOTE(review): DOCX_to_XML is used for both .doc and .docx, as in the original
            ' sample; the return code is not reported to the user - consider surfacing failures.
            u.ConvertFile(filePath, pathToXml, SautinSoft.UseOffice.eDirection.DOCX_to_XML)
        Finally
            ' Release MS Word from memory even when the conversion throws.
            u.CloseWord()
        End Try
    End If
End Sub
For uploading and downloading files to and from an FTP server, refer to the articles below.