c808a00b by Ean Schuessler

Ergonomic improvements for LLM discovery: tiered discovery tools, semantic naming, and recursive screen resolution
1 parent 5745c06b
1 arguments=--init-script /home/ean/.config/Code/User/globalStorage/redhat.java/1.50.0/config_linux/org.eclipse.osgi/58/0/.cp/gradle/init/init.gradle --init-script /home/ean/.config/Code/User/globalStorage/redhat.java/1.50.0/config_linux/org.eclipse.osgi/58/0/.cp/gradle/protobuf/init.gradle 1 arguments=--init-script /home/ean/.local/share/opencode/bin/jdtls/config_linux/org.eclipse.osgi/58/0/.cp/gradle/init/init.gradle
2 auto.sync=false 2 auto.sync=false
3 build.scans.enabled=false 3 build.scans.enabled=false
4 connection.gradle.distribution=GRADLE_DISTRIBUTION(VERSION(8.9)) 4 connection.gradle.distribution=GRADLE_DISTRIBUTION(VERSION(8.9))
5 connection.project.dir=../../../framework 5 connection.project.dir=
6 eclipse.preferences.version=1 6 eclipse.preferences.version=1
7 gradle.user.home= 7 gradle.user.home=
8 java.home=/usr/lib/jvm/java-17-openjdk-amd64 8 java.home=/usr/lib/jvm/java-21-openjdk-amd64
9 jvm.arguments= 9 jvm.arguments=
10 offline.mode=false 10 offline.mode=false
11 override.workspace.settings=true 11 override.workspace.settings=true
......
1 # YouTube Video Content Extractor using Jina.ai
2
3 This guide shows how to extract YouTube video metadata and description text using the jina.ai reader endpoint (`r.jina.ai`), which works around YouTube's restrictions on automated access.
4
5 ## How It Works
6
7 Jina.ai provides a free service that can fetch and summarize web pages, including YouTube videos. When you append a YouTube URL to `https://r.jina.ai/http://`, it:
8
9 1. Fetches the YouTube page content
10 2. Extracts video metadata, description, and available text content
11 3. Returns it in clean markdown format
12 4. Bypasses YouTube's JavaScript requirements and bot detection
13
14 ## Usage Examples
15
16 ### Basic Usage
17 ```bash
18 # Extract video content
19 curl "https://r.jina.ai/http://www.youtube.com/watch?v=VIDEO_ID"
20
21 # Example with your video
22 curl "https://r.jina.ai/http://www.youtube.com/watch?v=Tauucda-NV4"
23 ```
24
25 ### Python Script
26 ```python
27 import requests
28 import json
29 import sys
30
def extract_youtube_content(video_url):
    """Fetch a YouTube page through the jina.ai reader and extract key fields.

    Args:
        video_url: YouTube URL, with or without a leading ``http://`` or
            ``https://`` scheme.

    Returns:
        On success, a dict with the keys ``'title'``, ``'description'``,
        ``'views'`` and ``'full_content'`` (the raw response text).
        On any failure, a dict with the single key ``'error'`` holding the
        exception message.
    """
    # Strip only a *leading* scheme. A blanket str.replace() would also
    # mangle URLs embedded later in the string (e.g. in a query parameter).
    clean_url = video_url.strip()
    for scheme in ("https://", "http://"):
        if clean_url.lower().startswith(scheme):
            clean_url = clean_url[len(scheme):]
            break
    jina_url = f"https://r.jina.ai/http://{clean_url}"

    try:
        response = requests.get(jina_url, timeout=30)
        # Turn HTTP error statuses into exceptions so they are reported
        # through the 'error' key like every other failure.
        response.raise_for_status()

        content = response.text

        return {
            'title': extract_title(content),
            'description': extract_description(content),
            'views': extract_views(content),
            'full_content': content,
        }
    except Exception as e:
        # Deliberate best-effort boundary: callers test for 'error' in the
        # result instead of handling exceptions.
        return {'error': str(e)}
59
def extract_title(content):
    """Return the text of the first level-1 heading that does not mention YouTube.

    Args:
        content: Markdown text to scan.

    Returns:
        The heading text without its ``'# '`` marker, or ``"Unknown"`` when
        no matching heading exists.
    """
    for line in content.split('\n'):
        stripped = line.strip()
        if stripped.startswith('# ') and 'YouTube' not in stripped:
            # Slice off the two-character marker. str.strip('# ') would also
            # remove '#' characters that belong to the title ("Learn C#").
            return stripped[2:].strip()
    return "Unknown"
67
def extract_description(content):
    """Collect the lines that follow the first line containing 'Description'.

    Collection skips blank lines and stops at the first line that starts
    with ``'### '`` or ``'['``.

    Args:
        content: Markdown text to scan.

    Returns:
        The collected lines, stripped and joined with newlines; an empty
        string when no 'Description' marker is found.
    """
    in_description = False
    description = []

    for line in content.split('\n'):
        stripped = line.strip()

        if not in_description:
            # Only the first marker opens the section. Later lines that
            # merely mention "Description" are ordinary description text
            # and must not be dropped.
            if 'Description' in line:
                in_description = True
            continue

        if not stripped:
            continue
        if stripped.startswith(('### ', '[')):
            break
        description.append(stripped)

    return '\n'.join(description)
84
def extract_views(content):
    """Return the view count found in *content* as a string of digits.

    Args:
        content: Text to search for a ``<number> views`` phrase.

    Returns:
        The digits of the first match with thousands separators removed,
        or ``"Unknown"`` when no view count is present.
    """
    import re

    # Accept comma-grouped numbers ("1,234,567 views"). A bare \d+ would
    # capture only the last group ("567").
    match = re.search(r'(\d{1,3}(?:,\d{3})+|\d+)\s*views', content)
    if match is None:
        return "Unknown"
    return match.group(1).replace(',', '')
90
91 # Usage
if __name__ == "__main__":
    # Command-line entry point: expects exactly one argument, the video URL.
    if len(sys.argv) != 2:
        print("Usage: python youtube_extractor.py <youtube_url>")
        sys.exit(1)

    result = extract_youtube_content(sys.argv[1])

    if 'error' in result:
        # Report failures on stderr with a non-zero status so shell
        # pipelines and callers can detect them.
        print(f"Error: {result['error']}", file=sys.stderr)
        sys.exit(1)

    print(f"Title: {result['title']}")
    print(f"Views: {result['views']}")
    print(f"Description:\n{result['description']}")
106 ```
107
108 ### Shell Script
109 ```bash
#!/bin/bash

# YouTube Content Extractor using Jina.ai
# Usage: ./extract_youtube.sh <youtube_url>
#
# Prints everything from the first "Description:" line of the jina.ai
# response onward, minus the final line, framed by separator rules.

if [ $# -eq 0 ]; then
    echo "Usage: $0 <youtube_url>"
    exit 1
fi

YOUTUBE_URL="$1"

# Strip a leading scheme of either kind. Removing only "https://" would turn
# an "http://..." argument into "http://http://...".
CLEAN_URL="${YOUTUBE_URL#https://}"
CLEAN_URL="${CLEAN_URL#http://}"
JINA_URL="https://r.jina.ai/http://${CLEAN_URL}"

echo "Extracting content from: $YOUTUBE_URL"
echo "========================================"

# sed '$d' drops the last line portably; `head -n -1` is a GNU extension
# that BSD/macOS head rejects.
curl -s "$JINA_URL" | \
    sed -n '/Description:/,$p' | \
    sed '$d'

echo "========================================"
131 ```
132
133 ## Advanced Usage for MCP Integration
134
135 ### Integration with AI Analysis
136 ```python
def analyze_video_with_ai(video_url, ai_client):
    """Extract a video's metadata and ask an AI client to analyze it.

    Args:
        video_url: URL passed through to ``extract_youtube_content``.
        ai_client: Object exposing ``generate(prompt)``.

    Returns:
        The extraction result unchanged when it contains an ``'error'`` key;
        otherwise a dict with ``'video_data'`` (the extraction result) and
        ``'analysis'`` (whatever ``ai_client.generate`` returned).
    """
    video_data = extract_youtube_content(video_url)

    # Hand extraction failures straight back so the caller sees the error dict.
    if 'error' in video_data:
        return video_data

    prompt = f"""
    Analyze this YouTube video content:

    Title: {video_data['title']}
    Description: {video_data['description']}
    Views: {video_data['views']}

    Provide insights on:
    1. Main topic/subject
    2. Key actions demonstrated
    3. Technical details shown
    4. Notable results or outcomes
    """

    return {
        'video_data': video_data,
        'analysis': ai_client.generate(prompt),
    }
168 ```
169
170 ### Batch Processing
171 ```python
def process_video_list(video_urls, delay=1):
    """Extract content for each URL in turn, pausing between requests.

    Args:
        video_urls: Iterable of YouTube URLs.
        delay: Seconds to wait between consecutive requests (default 1).

    Returns:
        List of ``extract_youtube_content`` results, in input order.
    """
    # Imported locally: this snippet's surrounding file does not import
    # ``time``, so the original call raised NameError.
    import time

    results = []

    for index, url in enumerate(video_urls):
        # Rate limiting: wait only *between* requests, never after the last.
        if index:
            time.sleep(delay)

        print(f"Processing: {url}")
        results.append(extract_youtube_content(url))

    return results
185
# Example usage: extract every listed video in order.
video_urls = [
    "https://www.youtube.com/watch?v=Tauucda-NV4",
    "https://www.youtube.com/watch?v=ANOTHER_VIDEO_ID",
]
results = process_video_list(video_urls)
193 ```
194
195 ## Integration with Moqui MCP
196
197 ### MCP Service for Video Analysis
198 ```xml
<service verb="analyze" noun="YouTubeVideo" authenticate="true">
    <description>Analyze YouTube video content using jina.ai</description>
    <in-parameters>
        <parameter name="videoUrl" required="true" type="String"/>
    </in-parameters>
    <out-parameters>
        <parameter name="analysis" type="Map"/>
    </out-parameters>
    <actions>
        <script><![CDATA[
            import groovy.json.JsonSlurper
            import groovy.json.JsonBuilder

            // Extract content using jina.ai.
            // Strip only a leading scheme; a blanket replace() would also
            // mangle URLs embedded later in the string.
            def cleanUrl = videoUrl.trim().replaceFirst('(?i)^https?://', '')
            def jinaUrl = "https://r.jina.ai/http://${cleanUrl}"

            def connection = new URL(jinaUrl).openConnection()
            // Bound the wait so a stalled remote cannot hang the service call.
            connection.connectTimeout = 10000
            connection.readTimeout = 30000
            def response = connection.inputStream.getText('UTF-8')

            // Parse response (simplified)
            def lines = response.split('\n') as List
            def title = "Unknown"
            def description = ""

            // Iterate by index: indexOf(line) would return the FIRST line
            // equal to the current one, which is not necessarily this one.
            for (int i = 0; i < lines.size(); i++) {
                def line = lines[i]
                if (line.startsWith('# ') && title == "Unknown") {
                    // Drop only the leading marker; replace('# ', '') would
                    // also remove "# " sequences inside the title text.
                    title = line.substring(2).replace(' - YouTube', '').trim()
                }
                if (line.contains('Description:')) {
                    // Everything after the marker line is the description.
                    // Guard the case where the marker is the last line.
                    if (i + 1 < lines.size()) {
                        description = lines[(i + 1)..-1].join('\n').trim()
                    }
                    break
                }
            }

            analysis = [
                title: title,
                description: description,
                extractedAt: ec.user.nowTimestamp,
                source: 'jina.ai'
            ]
        ]]></script>
    </actions>
</service>
245 ```
246
247 ## Limitations and Considerations
248
249 ### What Jina.ai Extracts
250 - ✅ Video title and metadata
251 - ✅ Video description text
252 - ✅ View count and upload date
253 - ✅ Channel information
254 - ✅ Related videos (titles only)
255
256 ### What It Doesn't Extract
257 - ❌ Actual video content/transcript
258 - ❌ Audio from video
259 - ❌ Visual frames or screenshots
260 - ❌ Comments (requires login)
261
262 ### Rate Limiting
263 - Jina.ai is a free service - implement rate limiting
264 - Add delays between requests
265 - Cache results when possible
266
267 ### Error Handling
268 - Check for 403 errors (private/deleted videos)
269 - Handle network timeouts
270 - Validate YouTube URL format
271
272 ## Security and Privacy
273
274 ### Data Handling
275 - Only processes publicly available YouTube metadata
276 - No authentication required
277 - Content is extracted via third-party service
278
279 ### Usage Guidelines
280 - Respect YouTube's Terms of Service
281 - Don't circumvent paywalls or private content
282 - Use for legitimate research/analysis purposes
283
284 ## Alternative Services
285
286 If the default jina.ai output does not work for you, the following variants and fallbacks are available:
287 - `https://r.jina.ai/http://URL` (primary)
288 - `https://r.jina.ai/http://URL&format=json` (JSON format)
289 - Custom scrapers (more complex)
290
291 ## Troubleshooting
292
293 ### Common Issues
294 1. **403 Errors**: Video is private or deleted
295 2. **Empty Content**: Video has no description
296 3. **Rate Limiting**: Too many requests too quickly
297 4. **Network Issues**: Connection timeouts
298
299 ### Debug Mode
300 ```python
def debug_extract(video_url):
    """Fetch *video_url* through jina.ai and print diagnostics for each step.

    Prints the original URL, the derived jina.ai URL, the HTTP status code,
    the response length, and a success/failure marker. Any exception raised
    by the request is printed rather than propagated. Returns ``None``.

    Args:
        video_url: YouTube URL, with or without a leading scheme.
    """
    print(f"Original URL: {video_url}")

    # Strip only a leading scheme so URLs embedded later in the string
    # (e.g. in a query parameter) are left intact.
    clean_url = video_url.strip()
    for scheme in ("https://", "http://"):
        if clean_url.lower().startswith(scheme):
            clean_url = clean_url[len(scheme):]
            break
    jina_url = f"https://r.jina.ai/http://{clean_url}"

    print(f"Jina URL: {jina_url}")

    try:
        response = requests.get(jina_url, timeout=30)
        print(f"Status Code: {response.status_code}")
        print(f"Content Length: {len(response.text)}")

        if response.status_code == 200:
            print("✅ Success!")
        else:
            print("❌ Failed!")

    except Exception as e:
        # Debug helper: report every failure instead of raising.
        print(f"❌ Exception: {e}")
322 ```
323
324 This approach turns the jina.ai trick into a reusable skill for extracting YouTube video metadata and descriptions for analysis, documentation, or integration with other systems.
...\ No newline at end of file ...\ No newline at end of file
...@@ -4,10 +4,10 @@ ...@@ -4,10 +4,10 @@
4 "moqui_mcp": { 4 "moqui_mcp": {
5 "type": "remote", 5 "type": "remote",
6 "url": "http://localhost:8080/mcp", 6 "url": "http://localhost:8080/mcp",
7 "enabled": false, 7 "enabled": true,
8 "headers": { 8 "headers": {
9 "Authorization": "Basic am9obi5zYWxlczptb3F1aQ==" 9 "Authorization": "Basic am9obi5zYWxlczptb3F1aQ=="
10 } 10 }
11 } 11 }
12 } 12 }
13 }
...\ No newline at end of file ...\ No newline at end of file
13 }
......