Information Technology - Computer Programming - Source Code - Homebrew - Open Source - Software - Hardware - 8 bit - 16 bit - 32 bit - 64 bit - x86 - x64 - DOS - Windows - Linux - Arduino - Embedded - Development - Retro - Vintage - Math - Science - History - Hobby - Beginners - Professionals - Experiment - Research - Study - Fun - Games

HTM Parser

Share your Express Basic creations here.
Post Reply
admin
Site Admin
Posts: 127
Joined: Wed Feb 22, 2023 6:51 am

HTM Parser

Post by admin »

HTM Parser

Converts HTM/HTML files to plain text.

Code: Select all

1 REM HTM Parser by Gemino Smothers
2 REM Written in Express BASIC
3 REM Entry point: ask for a file name, convert that file, then stop.
4 REM lf$ holds the line-feed character (code 10).
10 lf$ = CHR$(10)
20 INPUT "*.htm: ", htmfile$
30 GOSUB 1000
40 END
1000 REM Convert the file named in htmfile$.
1001 REM Each record read from file #1 is scanned one character position at a
1002 REM time by the routine at 2000, which builds returnbuffer$. The finished
1003 REM buffer is shown on screen and written to parsed.txt (file #2).
1005 PRINT "Parsing: "; htmfile$
1010 OPEN "I", #1, htmfile$
1020 OPEN "O", #2, "parsed.txt"
1030 IF EOF(1) THEN 1080
1040 returnbuffer$ = ""
1045 INPUT #1, rawhtm$
1050 FOR chars = 1 TO LEN(rawhtm$)
1055 GOSUB 2000
1060 NEXT chars
1065 PRINT returnbuffer$
1070 PRINT #2, returnbuffer$
1075 GOTO 1030
1080 CLOSE #1
1085 CLOSE #2
1090 PRINT "Done."
1095 RETURN
2000 REM Dispatcher: look at the text starting at position chars in rawhtm$
2001 REM and hand it to the matching handler. Matching is case-insensitive and
2002 REM uses short prefixes of the tag name.
2003 tagchars$ = UCASE$(MID$(rawhtm$, chars, 3))
2004 tagfour$ = UCASE$(MID$(rawhtm$, chars, 4))
2005 tagfive$ = UCASE$(MID$(rawhtm$, chars, 5))
2010 IF tagchars$ = "<BO" THEN GOSUB 3000: GOTO 2100
2020 IF tagchars$ = "<BR" OR tagchars$ = "<HR" OR tagchars$ = "<P>" OR tagchars$ = "</P" THEN GOSUB 4000: GOTO 2100
2025 REM A paragraph tag that carries attributes also starts a new line.
2026 IF tagchars$ = "<P " THEN GOSUB 4000: GOTO 2100
2030 IF tagchars$ = "<DI" OR tagchars$ = "</D" OR tagchars$ = "<TA" OR tagchars$ = "</T" THEN GOSUB 4000: GOTO 2100
2040 IF tagchars$ = "<TR" OR tagchars$ = "<TD" OR tagchars$ = "<TH" THEN GOSUB 4000: GOTO 2100
2045 REM Only SCRIPT and STYLE switch output off. A 3-character "<ST" test
2046 REM would also catch STRONG and STRIKE and swallow their text, so STYLE
2047 REM is matched on four characters.
2050 IF tagchars$ = "<SC" OR tagfour$ = "<STY" THEN GOSUB 5000: GOTO 2100
2055 REM Likewise only /SCRIPT and /STYLE switch output back on; other
2056 REM closing tags that begin with /S fall through to the default handler.
2060 IF tagfour$ = "</SC" OR tagfive$ = "</STY" THEN GOSUB 6000: GOTO 2100
2070 IF tagchars$ = "<A " THEN GOSUB 7000: GOTO 2100
2080 IF tagchars$ = "&NB" THEN GOSUB 8000: GOTO 2100
2090 GOSUB 9000
2100 RETURN
3000 REM Body tag: switch text output on and skip to the end of the tag.
3010 body = 1
3020 GOSUB 10000
3030 RETURN
4000 REM Line-breaking tag: while output is on, add a line feed to the
4001 REM buffer and skip to the end of the tag. Does nothing while output is off.
4010 IF body = 0 THEN 4040
4020 returnbuffer$ = returnbuffer$ + lf$
4030 GOSUB 10000
4040 RETURN
5000 REM Opening tag of a block whose content is not text: switch output off.
5010 body = 0
5020 RETURN
6000 REM Closing tag of such a block: switch output on and skip the tag.
6010 body = 1
6020 GOSUB 10000
6030 RETURN
7000 REM Anchor tag: start a new line with "LINK: (", copy every character
7001 REM that sits between double quotes inside the tag, then close with ") ".
7002 REM All quoted attribute values are copied, one after another.
7003 REM On return chars is at the closing ">" or at the end of the record.
7004 REM quote is 1 while the scan is inside a quoted value, otherwise 0.
7005 returnbuffer$ = returnbuffer$ + lf$ + "LINK: (": quote = 0
7010 linkchar$ = MID$(rawhtm$, chars, 1)
7015 REM A quote mark only toggles the state and is never copied. This keeps
7016 REM an empty value such as href="" from emitting a quote character and
7017 REM leaving the scan stuck in quoted mode.
7020 IF linkchar$ = CHR$(34) THEN quote = 1 - quote: GOTO 7050
7040 IF quote THEN returnbuffer$ = returnbuffer$ + linkchar$
7050 chars = chars + 1
7060 IF MID$(rawhtm$, chars, 1) <> ">" AND chars < LEN(rawhtm$) THEN 7010
7070 returnbuffer$ = returnbuffer$ + ") ": RETURN
8000 REM Entity starting with &NB: emit one space, then skip to the ";".
8010 returnbuffer$ = returnbuffer$ + " "
8020 targetchar$ = ";"
8030 GOSUB 11000
8040 RETURN
9000 REM Default handler for the character at position chars.
9001 REM Nothing is done while output is off. A "<" is passed to 12000
9002 REM together with the character that follows it; any other character
9003 REM is passed to 13000.
9005 IF body = 0 THEN 9040
9010 IF MID$(rawhtm$, chars, 1) <> "<" THEN 9030
9020 targetchar$ = MID$(rawhtm$, chars + 1, 1): GOSUB 12000: GOTO 9040
9030 targetchar$ = MID$(rawhtm$, chars, 1): GOSUB 13000
9040 RETURN
10000 REM Skip to the next ">" in the current record.
10010 targetchar$ = ">"
10020 GOSUB 11000
10030 RETURN
11000 REM Search rawhtm$ for targetchar$, starting one position after chars.
11001 REM When found, chars is moved onto it; when not found, chars is unchanged.
11010 checkchars = chars + 1
11020 IF MID$(rawhtm$, checkchars, 1) = targetchar$ THEN chars = checkchars: RETURN
11030 IF checkchars >= LEN(rawhtm$) THEN RETURN
11040 checkchars = checkchars + 1: GOTO 11020
12000 REM Decide whether the "<" at position chars is text or the start of a tag.
12001 REM targetchar$ holds the character after the "<". A digit, space,
12002 REM comma or another "<" means the "<" is ordinary text and is copied
12003 REM to the buffer; anything else is a tag and is skipped up to its ">".
12004 REM When "<" is the last character of the record targetchar$ is empty.
12005 REM Start from a space so that case is handled as text every time,
12006 REM instead of reusing whatever achar held from an earlier call.
12007 achar = 32
12008 IF LEN(targetchar$) THEN achar = ASC(targetchar$)
12010 IF (achar < 48 OR achar > 57) AND achar <> 32 AND achar <> 44 AND achar <> 60 THEN GOSUB 10000: RETURN
12020 returnbuffer$ = returnbuffer$ + MID$(rawhtm$, chars, 1): RETURN
13000 REM Append targetchar$ to the buffer unless its character code is
13001 REM 9, 10, 11 or 13 (tab, line feed, vertical tab, carriage return).
13005 fachar = ASC(targetchar$)
13010 IF fachar >= 9 AND fachar <= 11 THEN 13030
13015 IF fachar = 13 THEN 13030
13020 returnbuffer$ = returnbuffer$ + targetchar$
13030 RETURN
Post Reply